# Review note (free text ahead of the first file patch):
# 1. Builder/all-beta.sh: the script now runs ./clear-cache.sh after
#    ./install_all.sh; the file still ends without a trailing newline.
# 2. Builder/jni-1.11/simd/Android.mk: the standalone module definitions
#    libsimd_arm (static library built from arm/src/jsimd_arm_neon.S) and
#    libsimd_i386 (prebuilt i386/lib/jsimd_i386.a) are deleted, the armeabi
#    branch is removed, and the armeabi-v7a branch loses its src/jsimd_arm.c /
#    libsimd_arm lines. The second Android.mk hunk continues past this line.
diff --git a/Builder/all-beta.sh b/Builder/all-beta.sh index 945e05942..5e43fb0cc 100755 --- a/Builder/all-beta.sh +++ b/Builder/all-beta.sh @@ -5,4 +5,5 @@ cd ../ cd Builder ./remove_all.sh -./install_all.sh \ No newline at end of file +./install_all.sh +./clear-cache.sh \ No newline at end of file diff --git a/Builder/jni-1.11/simd/Android.mk b/Builder/jni-1.11/simd/Android.mk index 9ca082a62..9a9fec44a 100644 --- a/Builder/jni-1.11/simd/Android.mk +++ b/Builder/jni-1.11/simd/Android.mk @@ -1,42 +1,5 @@ # Makefile for libjpeg-turbo - LOCAL_PATH := $(call my-dir) - -################################################## -### simd_arm ### -################################################## - -include $(CLEAR_VARS) - -LOCAL_MODULE := libsimd_arm - -LOCAL_MODULE_TAGS := release - -LOCAL_CFLAGS := $(APP_CFLAGS) -LOCAL_CPPFLAGS := $(APP_CPPFLAGS) -LOCAL_ARM_MODE := $(APP_ARM_MODE) - -LOCAL_SRC_FILES := arm/src/jsimd_arm_neon.S - -include $(BUILD_STATIC_LIBRARY) - -################################################## -### simd_i386 ### -################################################## - -include $(CLEAR_VARS) - -LOCAL_MODULE := libsimd_i386 - -LOCAL_MODULE_TAGS := release - -LOCAL_CFLAGS := $(APP_CFLAGS) -LOCAL_CPPFLAGS := $(APP_CPPFLAGS) - -LOCAL_SRC_FILES := i386/lib/jsimd_i386.a - -include $(PREBUILT_STATIC_LIBRARY) - ################################################## ### simd ### ################################################## @@ -54,41 +17,25 @@ LOCAL_C_INCLUDES := $(LOCAL_PATH)/src \ $(LOCAL_PATH)/../jpeg-turbo/android \ $(LOCAL_PATH)/../jpeg-turbo/include \ $(LOCAL_PATH)/../standalone/include - -ifeq ($(TARGET_ARCH_ABI),armeabi) - LOCAL_ARM_MODE := $(APP_ARM_MODE) - LOCAL_SRC_FILES := src/jsimd_arm.c - LOCAL_STATIC_LIBRARIES := libsimd_arm -endif # TARGET_ARCH_ABI == armeabi ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) LOCAL_ARM_MODE := $(APP_ARM_MODE) - LOCAL_SRC_FILES := src/jsimd_arm.c - LOCAL_STATIC_LIBRARIES := libsimd_arm -endif # TARGET_ARCH_ABI == armeabi-v7a - 
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a-hard)
-  LOCAL_ARM_MODE := $(APP_ARM_MODE)
-  LOCAL_SRC_FILES := src/jsimd_arm.c
-  LOCAL_STATIC_LIBRARIES := libsimd_arm
-endif # TARGET_ARCH_ABI == armeabi-v7a
+  LOCAL_SRC_FILES := jsimd_arm.c jsimd_arm_neon.S
+endif
 
 ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-  LOCAL_SRC_FILES := src/jsimd_none.c
-endif # TARGET_ARCH_ABI == arm64-v8a
+  LOCAL_ARM_MODE := $(APP_ARM_MODE)
+  LOCAL_SRC_FILES := jsimd_arm64.c jsimd_arm64_neon.S
+endif
 
 ifeq ($(TARGET_ARCH_ABI),x86)
-  LOCAL_SRC_FILES := src/jsimd_i386.c
-  LOCAL_STATIC_LIBRARIES := libsimd_i386
-endif # TARGET_ARCH_ABI == x86
+  LOCAL_SRC_FILES := jsimd_i386.c
+endif
 
 ifeq ($(TARGET_ARCH_ABI),x86_64)
-  LOCAL_SRC_FILES := src/jsimd_none.c
-endif # TARGET_ARCH_ABI == x86
+  LOCAL_SRC_FILES := jsimd_x86_64.c
+endif
 
-ifeq ($(TARGET_ARCH_ABI),mips)
-  LOCAL_SRC_FILES := src/jsimd_none.c
-endif # TARGET_ARCH_ABI == mips
 
 include $(BUILD_STATIC_LIBRARY)
diff --git a/Builder/jni-1.11/simd/CMakeLists.txt b/Builder/jni-1.11/simd/CMakeLists.txt
new file mode 100755
index 000000000..6e898d8d7
--- /dev/null
+++ b/Builder/jni-1.11/simd/CMakeLists.txt
@@ -0,0 +1,90 @@
+# Assembles the x86 / x86_64 SIMD sources with NASM or YASM.
+# Reads NASM (optional assembler path override) and SIMD_X86_64 from the
+# including scope; defines the custom target "simd" and exports the list of
+# object files to the parent scope as SIMD_OBJS.
+if(NOT DEFINED NASM)
+  find_program(NASM NAMES nasm yasm DOC "Path to NASM/YASM executable")
+endif()
+message(STATUS "NASM = ${NASM}")
+
+# Output format and platform defines passed to the assembler.
+if(SIMD_X86_64)
+  set(NAFLAGS -fwin64 -DWIN64 -D__x86_64__)
+else()
+  if(BORLAND)
+    set(NAFLAGS -fobj -DOBJ32)
+  else()
+    set(NAFLAGS -fwin32 -DWIN32)
+  endif()
+endif()
+list(APPEND NAFLAGS -I${CMAKE_SOURCE_DIR}/win/ -I${CMAKE_CURRENT_SOURCE_DIR}/)
+
+# This only works if building from the command line.  There is currently no way
+# to set a variable's value based on the build type when using the MSVC IDE.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug"
+  OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+  list(APPEND NAFLAGS -g)
+endif()
+
+# Base names (without the .asm suffix) of the sources to assemble.
+if(SIMD_X86_64)
+  set(SIMD_BASENAMES jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64
+    jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
+    jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
+    jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
+    jquanti-sse2-64)
+  message(STATUS "Building x86_64 SIMD extensions")
+else()
+  set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
+    jcgray-mmx jcsample-mmx jdcolor-mmx jdmerge-mmx jdsample-mmx jfdctfst-mmx
+    jfdctint-mmx jidctfst-mmx jidctint-mmx jidctred-mmx jquant-mmx jfdctflt-sse
+    jidctflt-sse jquant-sse jccolor-sse2 jcgray-sse2 jchuff-sse2 jcsample-sse2
+    jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
+    jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
+    jquanti-sse2)
+  message(STATUS "Building i386 SIMD extensions")
+endif()
+
+# Under the MSVC IDE, objects go into the ${CMAKE_CFG_INTDIR} subdirectory.
+if(MSVC_IDE)
+  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
+else()
+  set(OBJDIR ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Every .inc file matched by this glob is a dependency of every object below.
+file(GLOB INC_FILES *.inc)
+
+# One assembler rule per source.  jccolor*, jcgray*, jdcolor* and jdmerge*
+# also depend on the matching jccolext*/jcgryext*/jdcolext*/jdmrgext* source
+# (presumably because they %include it -- TODO confirm against the .asm files).
+foreach(file ${SIMD_BASENAMES})
+  set(DEPFILE "")
+  set(SIMD_SRC ${CMAKE_CURRENT_SOURCE_DIR}/${file}.asm)
+  if("${file}" MATCHES "jccolor")
+    string(REPLACE "jccolor" "jccolext" DEPFILE "${file}")
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  if("${file}" MATCHES "jcgray")
+    string(REPLACE "jcgray" "jcgryext" DEPFILE "${file}")
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  if("${file}" MATCHES "jdcolor")
+    string(REPLACE "jdcolor" "jdcolext" DEPFILE "${file}")
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  if("${file}" MATCHES "jdmerge")
+    string(REPLACE "jdmerge" "jdmrgext" DEPFILE "${file}")
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  set(SIMD_OBJ ${OBJDIR}/${file}.obj)
+  # VERBATIM keeps argument escaping consistent across generators.
+  add_custom_command(OUTPUT ${SIMD_OBJ}
+    DEPENDS ${SIMD_SRC} ${DEPFILE} ${INC_FILES}
+    COMMAND ${NASM} ${NAFLAGS} ${SIMD_SRC} -o${SIMD_OBJ}
+    VERBATIM)
+  list(APPEND SIMD_OBJS ${SIMD_OBJ})
+endforeach()
+
+set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
+add_custom_target(simd DEPENDS ${SIMD_OBJS})
diff --git a/Builder/jni-1.11/simd/Makefile.am b/Builder/jni-1.11/simd/Makefile.am
new file mode 100644
index 000000000..b8660d1c0
--- /dev/null
+++ b/Builder/jni-1.11/simd/Makefile.am
@@ -0,0 +1,102 @@
+noinst_LTLIBRARIES = libsimd.la
+
+BUILT_SOURCES = jsimdcfg.inc
+
+EXTRA_DIST = nasm_lt.sh CMakeLists.txt \
+	jccolext-mmx.asm jcgryext-mmx.asm jdcolext-mmx.asm jdmrgext-mmx.asm \
+	jccolext-sse2.asm jcgryext-sse2.asm jdcolext-sse2.asm jdmrgext-sse2.asm \
+	jccolext-sse2-64.asm jcgryext-sse2-64.asm jdcolext-sse2-64.asm \
+	jdmrgext-sse2-64.asm jccolext-altivec.c jcgryext-altivec.c \
+	jdcolext-altivec.c jdmrgext-altivec.c
+
+if SIMD_X86_64
+
+libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
+	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \
+	jccolor-sse2-64.asm jcgray-sse2-64.asm jchuff-sse2-64.asm \
+	jcsample-sse2-64.asm jdcolor-sse2-64.asm jdmerge-sse2-64.asm \
+	jdsample-sse2-64.asm jfdctfst-sse2-64.asm jfdctint-sse2-64.asm \
+	jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \
+	jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm
+
+jccolor-sse2-64.lo: jccolext-sse2-64.asm
+jcgray-sse2-64.lo: jcgryext-sse2-64.asm
+jdcolor-sse2-64.lo: jdcolext-sse2-64.asm
+jdmerge-sse2-64.lo: jdmrgext-sse2-64.asm
+
+endif
+
+if SIMD_I386
+
+libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
+	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu.asm \
+	jfdctflt-3dn.asm jidctflt-3dn.asm jquant-3dn.asm \
+	jccolor-mmx.asm jcgray-mmx.asm jcsample-mmx.asm \
+	jdcolor-mmx.asm jdmerge-mmx.asm jdsample-mmx.asm \
+	
jfdctfst-mmx.asm jfdctint-mmx.asm jidctfst-mmx.asm \
+	jidctint-mmx.asm jidctred-mmx.asm jquant-mmx.asm \
+	jfdctflt-sse.asm jidctflt-sse.asm jquant-sse.asm \
+	jccolor-sse2.asm jcgray-sse2.asm jchuff-sse2.asm \
+	jcsample-sse2.asm jdcolor-sse2.asm jdmerge-sse2.asm \
+	jdsample-sse2.asm jfdctfst-sse2.asm jfdctint-sse2.asm \
+	jidctflt-sse2.asm jidctfst-sse2.asm jidctint-sse2.asm \
+	jidctred-sse2.asm jquantf-sse2.asm jquanti-sse2.asm
+
+jccolor-mmx.lo: jccolext-mmx.asm
+jcgray-mmx.lo: jcgryext-mmx.asm
+jdcolor-mmx.lo: jdcolext-mmx.asm
+jdmerge-mmx.lo: jdmrgext-mmx.asm
+jccolor-sse2.lo: jccolext-sse2.asm
+jcgray-sse2.lo: jcgryext-sse2.asm
+jdcolor-sse2.lo: jdcolext-sse2.asm
+jdmerge-sse2.lo: jdmrgext-sse2.asm
+
+endif
+
+if SIMD_ARM
+
+libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
+
+endif
+
+if SIMD_ARM_64
+
+libsimd_la_SOURCES = jsimd_arm64.c jsimd_arm64_neon.S
+
+endif
+
+if SIMD_MIPS
+
+libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S
+
+endif
+
+if SIMD_POWERPC
+
+noinst_LTLIBRARIES += libsimd_altivec.la
+
+libsimd_altivec_la_SOURCES = \
+	jccolor-altivec.c jcgray-altivec.c jcsample-altivec.c \
+	jdcolor-altivec.c jdmerge-altivec.c jdsample-altivec.c \
+	jfdctfst-altivec.c jfdctint-altivec.c \
+	jidctfst-altivec.c jidctint-altivec.c \
+	jquanti-altivec.c
+libsimd_altivec_la_CFLAGS = -maltivec
+
+jccolor-altivec.lo: jccolext-altivec.c
+jcgray-altivec.lo: jcgryext-altivec.c
+jdcolor-altivec.lo: jdcolext-altivec.c
+jdmerge-altivec.lo: jdmrgext-altivec.c
+
+libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h
+libsimd_la_LIBADD = libsimd_altivec.la
+
+endif
+
+AM_CPPFLAGS = -I$(top_srcdir)
+
+.asm.lo:
+	$(AM_V_GEN) $(LIBTOOL) $(AM_V_lt) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(AM_V_lt) $(NASM) $(NAFLAGS) -I$(srcdir) -I. $< -o $@
+
+jsimdcfg.inc: $(srcdir)/jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h
+	$(AM_V_GEN) $(CPP) -I$(top_builddir) -I$(top_builddir)/simd $(srcdir)/jsimdcfg.inc.h | $(EGREP) "^[\;%]|^\ %" | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' > $@
# Review note: in the SIMD_I386 section of Makefile.am above, the rule for
# jcgray-mmx.lo was spelled "jcgray.-mmx.lo". No object is built under that
# name (the object list uses jcgray-mmx.lo), so a change to jcgryext-mmx.asm
# did not trigger a rebuild. The generated Makefile.in below must be
# regenerated with automake so it picks up the corrected rule.
diff --git a/Builder/jni-1.11/simd/Makefile.in b/Builder/jni-1.11/simd/Makefile.in
new file mode 100644
index 000000000..d6afe9873
--- /dev/null
+++ b/Builder/jni-1.11/simd/Makefile.in
@@ -0,0 +1,916 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?)
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +@SIMD_POWERPC_TRUE@am__append_1 = libsimd_altivec.la +subdir = simd +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = 
$(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h $(top_builddir)/jconfig.h \ + $(top_builddir)/jconfigint.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +@SIMD_POWERPC_TRUE@libsimd_la_DEPENDENCIES = libsimd_altivec.la +am__libsimd_la_SOURCES_DIST = jsimd_arm64.c jsimd_arm64_neon.S \ + jsimd_arm.c jsimd_arm_neon.S jsimd_i386.c jsimd.h \ + jsimdcfg.inc.h jsimdext.inc jcolsamp.inc jdct.inc \ + jpeg_nbits_table.inc jsimdcpu.asm jfdctflt-3dn.asm \ + jidctflt-3dn.asm jquant-3dn.asm jccolor-mmx.asm jcgray-mmx.asm \ + jcsample-mmx.asm jdcolor-mmx.asm jdmerge-mmx.asm \ + jdsample-mmx.asm jfdctfst-mmx.asm jfdctint-mmx.asm \ + jidctfst-mmx.asm jidctint-mmx.asm jidctred-mmx.asm \ + jquant-mmx.asm jfdctflt-sse.asm jidctflt-sse.asm \ + jquant-sse.asm jccolor-sse2.asm jcgray-sse2.asm \ + jchuff-sse2.asm jcsample-sse2.asm jdcolor-sse2.asm \ + jdmerge-sse2.asm jdsample-sse2.asm jfdctfst-sse2.asm \ + jfdctint-sse2.asm jidctflt-sse2.asm jidctfst-sse2.asm \ + jidctint-sse2.asm jidctred-sse2.asm jquantf-sse2.asm \ + jquanti-sse2.asm jsimd_mips.c jsimd_mips_dspr2_asm.h \ + jsimd_mips_dspr2.S jsimd_powerpc.c jsimd_altivec.h jcsample.h \ + jsimd_x86_64.c jfdctflt-sse-64.asm jccolor-sse2-64.asm \ + jcgray-sse2-64.asm jchuff-sse2-64.asm jcsample-sse2-64.asm \ + jdcolor-sse2-64.asm jdmerge-sse2-64.asm jdsample-sse2-64.asm \ + jfdctfst-sse2-64.asm jfdctint-sse2-64.asm jidctflt-sse2-64.asm \ + jidctfst-sse2-64.asm jidctint-sse2-64.asm jidctred-sse2-64.asm \ + jquantf-sse2-64.asm jquanti-sse2-64.asm +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@am_libsimd_la_OBJECTS = jsimd_x86_64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jfdctflt-sse-64.lo \ 
+@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jccolor-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jcgray-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jchuff-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jcsample-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jdcolor-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jdmerge-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jdsample-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jfdctfst-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jfdctint-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jidctflt-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jidctfst-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jidctint-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jidctred-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jquantf-sse2-64.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_FALSE@@SIMD_X86_64_TRUE@ jquanti-sse2-64.lo 
+@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_FALSE@@SIMD_POWERPC_TRUE@am_libsimd_la_OBJECTS = jsimd_powerpc.lo +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_TRUE@am_libsimd_la_OBJECTS = jsimd_mips.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_FALSE@@SIMD_MIPS_TRUE@ jsimd_mips_dspr2.lo +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@am_libsimd_la_OBJECTS = jsimd_i386.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jsimdcpu.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jfdctflt-3dn.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctflt-3dn.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jquant-3dn.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jccolor-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jcgray-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jcsample-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jdcolor-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jdmerge-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jdsample-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jfdctfst-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jfdctint-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctfst-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctint-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctred-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jquant-mmx.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jfdctflt-sse.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctflt-sse.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jquant-sse.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jccolor-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jcgray-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jchuff-sse2.lo \ 
+@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jcsample-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jdcolor-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jdmerge-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jdsample-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jfdctfst-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jfdctint-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctflt-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctfst-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctint-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jidctred-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jquantf-sse2.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_FALSE@@SIMD_I386_TRUE@ jquanti-sse2.lo +@SIMD_ARM_64_FALSE@@SIMD_ARM_TRUE@am_libsimd_la_OBJECTS = \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_TRUE@ jsimd_arm.lo \ +@SIMD_ARM_64_FALSE@@SIMD_ARM_TRUE@ jsimd_arm_neon.lo +@SIMD_ARM_64_TRUE@am_libsimd_la_OBJECTS = jsimd_arm64.lo \ +@SIMD_ARM_64_TRUE@ jsimd_arm64_neon.lo +libsimd_la_OBJECTS = $(am_libsimd_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libsimd_altivec_la_LIBADD = +am__libsimd_altivec_la_SOURCES_DIST = jccolor-altivec.c \ + jcgray-altivec.c jcsample-altivec.c jdcolor-altivec.c \ + jdmerge-altivec.c jdsample-altivec.c jfdctfst-altivec.c \ + jfdctint-altivec.c jidctfst-altivec.c jidctint-altivec.c \ + jquanti-altivec.c +@SIMD_POWERPC_TRUE@am_libsimd_altivec_la_OBJECTS = \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jccolor-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jcgray-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jcsample-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jdcolor-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jdmerge-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jdsample-altivec.lo \ 
+@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jfdctfst-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jfdctint-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jidctfst-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jidctint-altivec.lo \ +@SIMD_POWERPC_TRUE@ libsimd_altivec_la-jquanti-altivec.lo +libsimd_altivec_la_OBJECTS = $(am_libsimd_altivec_la_OBJECTS) +libsimd_altivec_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libsimd_altivec_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +@SIMD_POWERPC_TRUE@am_libsimd_altivec_la_rpath = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CCASFLAGS) $(CCASFLAGS) +AM_V_CPPAS = $(am__v_CPPAS_@AM_V@) +am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@) +am__v_CPPAS_0 = @echo " CPPAS " $@; +am__v_CPPAS_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = 
$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libsimd_la_SOURCES) $(libsimd_altivec_la_SOURCES) +DIST_SOURCES = $(am__libsimd_la_SOURCES_DIST) \ + $(am__libsimd_altivec_la_SOURCES_DIST) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BUILD = @BUILD@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEBARCH = @DEBARCH@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JAR = @JAR@ +JAVA = @JAVA@ +JAVAC = @JAVAC@ +JAVACFLAGS = @JAVACFLAGS@ +JAVA_RPM_CONTENTS_1 = @JAVA_RPM_CONTENTS_1@ +JAVA_RPM_CONTENTS_2 = @JAVA_RPM_CONTENTS_2@ +JNI_CFLAGS = @JNI_CFLAGS@ +JPEG_LIB_VERSION = @JPEG_LIB_VERSION@ +JPEG_LIB_VERSION_DECIMAL = @JPEG_LIB_VERSION_DECIMAL@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIBTOOL_CURRENT = @LIBTOOL_CURRENT@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MEM_SRCDST_FUNCTIONS = @MEM_SRCDST_FUNCTIONS@ +MKDIR_P = @MKDIR_P@ +NAFLAGS = @NAFLAGS@ +NASM = @NASM@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ 
+PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKGNAME = @PKGNAME@ +RANLIB = @RANLIB@ +RPMARCH = @RPMARCH@ +RPM_CONFIG_ARGS = @RPM_CONFIG_ARGS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SO_AGE = @SO_AGE@ +SO_MAJOR_VERSION = @SO_MAJOR_VERSION@ +SO_MINOR_VERSION = @SO_MINOR_VERSION@ +STRIP = @STRIP@ +VERSION = @VERSION@ +VERSION_SCRIPT_FLAG = @VERSION_SCRIPT_FLAG@ +WITH_JAVA = @WITH_JAVA@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +noinst_LTLIBRARIES = libsimd.la $(am__append_1) +BUILT_SOURCES = jsimdcfg.inc 
+EXTRA_DIST = nasm_lt.sh CMakeLists.txt \ + jccolext-mmx.asm jcgryext-mmx.asm jdcolext-mmx.asm jdmrgext-mmx.asm \ + jccolext-sse2.asm jcgryext-sse2.asm jdcolext-sse2.asm jdmrgext-sse2.asm \ + jccolext-sse2-64.asm jcgryext-sse2-64.asm jdcolext-sse2-64.asm \ + jdmrgext-sse2-64.asm jccolext-altivec.c jcgryext-altivec.c \ + jdcolext-altivec.c jdmrgext-altivec.c + +@SIMD_ARM_64_TRUE@libsimd_la_SOURCES = jsimd_arm64.c jsimd_arm64_neon.S +@SIMD_ARM_TRUE@libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S +@SIMD_I386_TRUE@libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \ +@SIMD_I386_TRUE@ jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu.asm \ +@SIMD_I386_TRUE@ jfdctflt-3dn.asm jidctflt-3dn.asm jquant-3dn.asm \ +@SIMD_I386_TRUE@ jccolor-mmx.asm jcgray-mmx.asm jcsample-mmx.asm \ +@SIMD_I386_TRUE@ jdcolor-mmx.asm jdmerge-mmx.asm jdsample-mmx.asm \ +@SIMD_I386_TRUE@ jfdctfst-mmx.asm jfdctint-mmx.asm jidctfst-mmx.asm \ +@SIMD_I386_TRUE@ jidctint-mmx.asm jidctred-mmx.asm jquant-mmx.asm \ +@SIMD_I386_TRUE@ jfdctflt-sse.asm jidctflt-sse.asm jquant-sse.asm \ +@SIMD_I386_TRUE@ jccolor-sse2.asm jcgray-sse2.asm jchuff-sse2.asm \ +@SIMD_I386_TRUE@ jcsample-sse2.asm jdcolor-sse2.asm jdmerge-sse2.asm \ +@SIMD_I386_TRUE@ jdsample-sse2.asm jfdctfst-sse2.asm jfdctint-sse2.asm \ +@SIMD_I386_TRUE@ jidctflt-sse2.asm jidctfst-sse2.asm jidctint-sse2.asm \ +@SIMD_I386_TRUE@ jidctred-sse2.asm jquantf-sse2.asm jquanti-sse2.asm + +@SIMD_MIPS_TRUE@libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S +@SIMD_POWERPC_TRUE@libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h +@SIMD_X86_64_TRUE@libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \ +@SIMD_X86_64_TRUE@ jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \ +@SIMD_X86_64_TRUE@ jccolor-sse2-64.asm jcgray-sse2-64.asm jchuff-sse2-64.asm \ +@SIMD_X86_64_TRUE@ jcsample-sse2-64.asm jdcolor-sse2-64.asm jdmerge-sse2-64.asm \ +@SIMD_X86_64_TRUE@ 
jdsample-sse2-64.asm jfdctfst-sse2-64.asm jfdctint-sse2-64.asm \ +@SIMD_X86_64_TRUE@ jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \ +@SIMD_X86_64_TRUE@ jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm + +@SIMD_POWERPC_TRUE@libsimd_altivec_la_SOURCES = \ +@SIMD_POWERPC_TRUE@ jccolor-altivec.c jcgray-altivec.c jcsample-altivec.c \ +@SIMD_POWERPC_TRUE@ jdcolor-altivec.c jdmerge-altivec.c jdsample-altivec.c \ +@SIMD_POWERPC_TRUE@ jfdctfst-altivec.c jfdctint-altivec.c \ +@SIMD_POWERPC_TRUE@ jidctfst-altivec.c jidctint-altivec.c \ +@SIMD_POWERPC_TRUE@ jquanti-altivec.c + +@SIMD_POWERPC_TRUE@libsimd_altivec_la_CFLAGS = -maltivec +@SIMD_POWERPC_TRUE@libsimd_la_LIBADD = libsimd_altivec.la +AM_CPPFLAGS = -I$(top_srcdir) +all: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) all-am + +.SUFFIXES: +.SUFFIXES: .S .asm .c .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign simd/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign simd/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libsimd.la: $(libsimd_la_OBJECTS) $(libsimd_la_DEPENDENCIES) $(EXTRA_libsimd_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(libsimd_la_OBJECTS) $(libsimd_la_LIBADD) $(LIBS) + +libsimd_altivec.la: $(libsimd_altivec_la_OBJECTS) $(libsimd_altivec_la_DEPENDENCIES) $(EXTRA_libsimd_altivec_la_DEPENDENCIES) + $(AM_V_CCLD)$(libsimd_altivec_la_LINK) $(am_libsimd_altivec_la_rpath) $(libsimd_altivec_la_OBJECTS) $(libsimd_altivec_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_arm.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_arm64.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_arm64_neon.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_arm_neon.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_i386.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_mips.Plo@am__quote@ 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_mips_dspr2.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_powerpc.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_x86_64.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jccolor-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jcgray-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jcsample-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jdcolor-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jdmerge-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jdsample-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jfdctfst-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jfdctint-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jidctfst-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jidctint-altivec.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_altivec_la-jquanti-altivec.Plo@am__quote@ + +.S.o: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $< + +.S.obj: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo 
$(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.S.lo: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $< + +.c.o: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< + +.c.obj: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ 
DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< + +libsimd_altivec_la-jccolor-altivec.lo: jccolor-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jccolor-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jccolor-altivec.Tpo -c -o libsimd_altivec_la-jccolor-altivec.lo `test -f 'jccolor-altivec.c' || echo '$(srcdir)/'`jccolor-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jccolor-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jccolor-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jccolor-altivec.c' object='libsimd_altivec_la-jccolor-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jccolor-altivec.lo `test -f 'jccolor-altivec.c' || echo '$(srcdir)/'`jccolor-altivec.c + +libsimd_altivec_la-jcgray-altivec.lo: jcgray-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jcgray-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jcgray-altivec.Tpo -c -o libsimd_altivec_la-jcgray-altivec.lo `test -f 'jcgray-altivec.c' || echo '$(srcdir)/'`jcgray-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jcgray-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jcgray-altivec.Plo 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jcgray-altivec.c' object='libsimd_altivec_la-jcgray-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jcgray-altivec.lo `test -f 'jcgray-altivec.c' || echo '$(srcdir)/'`jcgray-altivec.c + +libsimd_altivec_la-jcsample-altivec.lo: jcsample-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jcsample-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jcsample-altivec.Tpo -c -o libsimd_altivec_la-jcsample-altivec.lo `test -f 'jcsample-altivec.c' || echo '$(srcdir)/'`jcsample-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jcsample-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jcsample-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jcsample-altivec.c' object='libsimd_altivec_la-jcsample-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jcsample-altivec.lo `test -f 'jcsample-altivec.c' || echo '$(srcdir)/'`jcsample-altivec.c + +libsimd_altivec_la-jdcolor-altivec.lo: jdcolor-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) 
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jdcolor-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jdcolor-altivec.Tpo -c -o libsimd_altivec_la-jdcolor-altivec.lo `test -f 'jdcolor-altivec.c' || echo '$(srcdir)/'`jdcolor-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jdcolor-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jdcolor-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jdcolor-altivec.c' object='libsimd_altivec_la-jdcolor-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jdcolor-altivec.lo `test -f 'jdcolor-altivec.c' || echo '$(srcdir)/'`jdcolor-altivec.c + +libsimd_altivec_la-jdmerge-altivec.lo: jdmerge-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jdmerge-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jdmerge-altivec.Tpo -c -o libsimd_altivec_la-jdmerge-altivec.lo `test -f 'jdmerge-altivec.c' || echo '$(srcdir)/'`jdmerge-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jdmerge-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jdmerge-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jdmerge-altivec.c' object='libsimd_altivec_la-jdmerge-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ 
$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jdmerge-altivec.lo `test -f 'jdmerge-altivec.c' || echo '$(srcdir)/'`jdmerge-altivec.c + +libsimd_altivec_la-jdsample-altivec.lo: jdsample-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jdsample-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jdsample-altivec.Tpo -c -o libsimd_altivec_la-jdsample-altivec.lo `test -f 'jdsample-altivec.c' || echo '$(srcdir)/'`jdsample-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jdsample-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jdsample-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jdsample-altivec.c' object='libsimd_altivec_la-jdsample-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jdsample-altivec.lo `test -f 'jdsample-altivec.c' || echo '$(srcdir)/'`jdsample-altivec.c + +libsimd_altivec_la-jfdctfst-altivec.lo: jfdctfst-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jfdctfst-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jfdctfst-altivec.Tpo -c -o 
libsimd_altivec_la-jfdctfst-altivec.lo `test -f 'jfdctfst-altivec.c' || echo '$(srcdir)/'`jfdctfst-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jfdctfst-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jfdctfst-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jfdctfst-altivec.c' object='libsimd_altivec_la-jfdctfst-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jfdctfst-altivec.lo `test -f 'jfdctfst-altivec.c' || echo '$(srcdir)/'`jfdctfst-altivec.c + +libsimd_altivec_la-jfdctint-altivec.lo: jfdctint-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jfdctint-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jfdctint-altivec.Tpo -c -o libsimd_altivec_la-jfdctint-altivec.lo `test -f 'jfdctint-altivec.c' || echo '$(srcdir)/'`jfdctint-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jfdctint-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jfdctint-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jfdctint-altivec.c' object='libsimd_altivec_la-jfdctint-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o 
libsimd_altivec_la-jfdctint-altivec.lo `test -f 'jfdctint-altivec.c' || echo '$(srcdir)/'`jfdctint-altivec.c + +libsimd_altivec_la-jidctfst-altivec.lo: jidctfst-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jidctfst-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jidctfst-altivec.Tpo -c -o libsimd_altivec_la-jidctfst-altivec.lo `test -f 'jidctfst-altivec.c' || echo '$(srcdir)/'`jidctfst-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jidctfst-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jidctfst-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jidctfst-altivec.c' object='libsimd_altivec_la-jidctfst-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jidctfst-altivec.lo `test -f 'jidctfst-altivec.c' || echo '$(srcdir)/'`jidctfst-altivec.c + +libsimd_altivec_la-jidctint-altivec.lo: jidctint-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jidctint-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jidctint-altivec.Tpo -c -o libsimd_altivec_la-jidctint-altivec.lo `test -f 'jidctint-altivec.c' || echo '$(srcdir)/'`jidctint-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jidctint-altivec.Tpo 
$(DEPDIR)/libsimd_altivec_la-jidctint-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jidctint-altivec.c' object='libsimd_altivec_la-jidctint-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jidctint-altivec.lo `test -f 'jidctint-altivec.c' || echo '$(srcdir)/'`jidctint-altivec.c + +libsimd_altivec_la-jquanti-altivec.lo: jquanti-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -MT libsimd_altivec_la-jquanti-altivec.lo -MD -MP -MF $(DEPDIR)/libsimd_altivec_la-jquanti-altivec.Tpo -c -o libsimd_altivec_la-jquanti-altivec.lo `test -f 'jquanti-altivec.c' || echo '$(srcdir)/'`jquanti-altivec.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libsimd_altivec_la-jquanti-altivec.Tpo $(DEPDIR)/libsimd_altivec_la-jquanti-altivec.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='jquanti-altivec.c' object='libsimd_altivec_la-jquanti-altivec.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_altivec_la_CFLAGS) $(CFLAGS) -c -o libsimd_altivec_la-jquanti-altivec.lo `test -f 'jquanti-altivec.c' || echo '$(srcdir)/'`jquanti-altivec.c + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + 
$(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ 
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+ -test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES) +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: all check install install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \ + ctags-am distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +@SIMD_X86_64_TRUE@jccolor-sse2-64.lo: jccolext-sse2-64.asm +@SIMD_X86_64_TRUE@jcgray-sse2-64.lo: jcgryext-sse2-64.asm 
+@SIMD_X86_64_TRUE@jdcolor-sse2-64.lo: jdcolext-sse2-64.asm +@SIMD_X86_64_TRUE@jdmerge-sse2-64.lo: jdmrgext-sse2-64.asm + +@SIMD_I386_TRUE@jccolor-mmx.lo: jccolext-mmx.asm +@SIMD_I386_TRUE@jcgray-mmx.lo: jcgryext-mmx.asm +@SIMD_I386_TRUE@jdcolor-mmx.lo: jdcolext-mmx.asm +@SIMD_I386_TRUE@jdmerge-mmx.lo: jdmrgext-mmx.asm +@SIMD_I386_TRUE@jccolor-sse2.lo: jccolext-sse2.asm +@SIMD_I386_TRUE@jcgray-sse2.lo: jcgryext-sse2.asm +@SIMD_I386_TRUE@jdcolor-sse2.lo: jdcolext-sse2.asm +@SIMD_I386_TRUE@jdmerge-sse2.lo: jdmrgext-sse2.asm + +@SIMD_POWERPC_TRUE@jccolor-altivec.lo: jccolext-altivec.c +@SIMD_POWERPC_TRUE@jcgray-altivec.lo: jcgryext-altivec.c +@SIMD_POWERPC_TRUE@jdcolor-altivec.lo: jdcolext-altivec.c +@SIMD_POWERPC_TRUE@jdmerge-altivec.lo: jdmrgext-altivec.c + +.asm.lo: + $(AM_V_GEN) $(LIBTOOL) $(AM_V_lt) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(AM_V_lt) $(NASM) $(NAFLAGS) -I$(srcdir) -I. $< -o $@ + +jsimdcfg.inc: $(srcdir)/jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h + $(AM_V_GEN) $(CPP) -I$(top_builddir) -I$(top_builddir)/simd $(srcdir)/jsimdcfg.inc.h | $(EGREP) "^[\;%]|^\ %" | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' > $@ + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/Builder/jni-1.11/simd/arm/src/jsimd_arm_neon.S b/Builder/jni-1.11/simd/arm/src/jsimd_arm_neon.S deleted file mode 100644 index ac6c8607b..000000000 --- a/Builder/jni-1.11/simd/arm/src/jsimd_arm_neon.S +++ /dev/null @@ -1,2408 +0,0 @@ -/* - * ARMv7 NEON optimizations for libjpeg-turbo - * - * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). - * All rights reserved. - * Author: Siarhei Siamashka - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. 
- * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ -#endif - -.text -.fpu neon -.arch armv7a -.object_arch armv4 -.arm - - -#define RESPECT_STRICT_ALIGNMENT 1 - - -/*****************************************************************************/ - -/* Supplementary macro for setting function attributes */ -.macro asm_function fname -#ifdef __APPLE__ - .func _\fname - .globl _\fname -_\fname: -#else - .func \fname - .global \fname -#ifdef __ELF__ - .hidden \fname - .type \fname, %function -#endif -\fname: -#endif -.endm - -/* Transpose a block of 4x4 coefficients in four 64-bit registers */ -.macro transpose_4x4 x0, x1, x2, x3 - vtrn.16 \x0, \x1 - vtrn.16 \x2, \x3 - vtrn.32 \x0, \x2 - vtrn.32 \x1, \x3 -.endm - - -#define CENTERJSAMPLE 128 - -/*****************************************************************************/ - -/* - * Perform dequantization and inverse DCT on one block of coefficients. 
- * - * GLOBAL(void) - * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, - * JSAMPARRAY output_buf, JDIMENSION output_col) - */ - -#define FIX_0_298631336 (2446) -#define FIX_0_390180644 (3196) -#define FIX_0_541196100 (4433) -#define FIX_0_765366865 (6270) -#define FIX_0_899976223 (7373) -#define FIX_1_175875602 (9633) -#define FIX_1_501321110 (12299) -#define FIX_1_847759065 (15137) -#define FIX_1_961570560 (16069) -#define FIX_2_053119869 (16819) -#define FIX_2_562915447 (20995) -#define FIX_3_072711026 (25172) - -#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) -#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) -#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) -#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) -#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) -#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) -#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) -#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) - -/* - * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. 
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm' - */ -#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ -{ \ - DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ - INT32 q1, q2, q3, q4, q5, q6, q7; \ - INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ - \ - /* 1-D iDCT input data */ \ - row0 = xrow0; \ - row1 = xrow1; \ - row2 = xrow2; \ - row3 = xrow3; \ - row4 = xrow4; \ - row5 = xrow5; \ - row6 = xrow6; \ - row7 = xrow7; \ - \ - q5 = row7 + row3; \ - q4 = row5 + row1; \ - q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ - MULTIPLY(q4, FIX_1_175875602); \ - q7 = MULTIPLY(q5, FIX_1_175875602) + \ - MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ - q2 = MULTIPLY(row2, FIX_0_541196100) + \ - MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ - q4 = q6; \ - q3 = ((INT32) row0 - (INT32) row4) << 13; \ - q6 += MULTIPLY(row5, -FIX_2_562915447) + \ - MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ - /* now we can use q1 (reloadable constants have been used up) */ \ - q1 = q3 + q2; \ - q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ - MULTIPLY(row1, -FIX_0_899976223); \ - q5 = q7; \ - q1 = q1 + q6; \ - q7 += MULTIPLY(row7, -FIX_0_899976223) + \ - MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ - \ - /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ - tmp11_plus_tmp2 = q1; \ - row1 = 0; \ - \ - q1 = q1 - q6; \ - q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ - MULTIPLY(row3, -FIX_2_562915447); \ - q1 = q1 - q6; \ - q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ - MULTIPLY(row6, FIX_0_541196100); \ - q3 = q3 - q2; \ - \ - /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ - tmp11_minus_tmp2 = q1; \ - \ - q1 = ((INT32) row0 + (INT32) row4) << 13; \ - q2 = q1 + q6; \ - q1 = q1 - q6; \ - \ - /* pick up the results */ \ - tmp0 = q4; \ - tmp1 = q5; \ - tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ - tmp3 = q7; \ - tmp10 = q2; \ - tmp11 
= (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ - tmp12 = q3; \ - tmp13 = q1; \ -} - -#define XFIX_0_899976223 d0[0] -#define XFIX_0_541196100 d0[1] -#define XFIX_2_562915447 d0[2] -#define XFIX_0_298631336_MINUS_0_899976223 d0[3] -#define XFIX_1_501321110_MINUS_0_899976223 d1[0] -#define XFIX_2_053119869_MINUS_2_562915447 d1[1] -#define XFIX_0_541196100_PLUS_0_765366865 d1[2] -#define XFIX_1_175875602 d1[3] -#define XFIX_1_175875602_MINUS_0_390180644 d2[0] -#define XFIX_0_541196100_MINUS_1_847759065 d2[1] -#define XFIX_3_072711026_MINUS_2_562915447 d2[2] -#define XFIX_1_175875602_MINUS_1_961570560 d2[3] - -.balign 16 -jsimd_idct_islow_neon_consts: - .short FIX_0_899976223 /* d0[0] */ - .short FIX_0_541196100 /* d0[1] */ - .short FIX_2_562915447 /* d0[2] */ - .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ - .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ - .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ - .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ - .short FIX_1_175875602 /* d1[3] */ - /* reloadable constants */ - .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ - .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ - .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ - .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ - -asm_function jsimd_idct_islow_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - ROW0L .req d16 - ROW0R .req d17 - ROW1L .req d18 - ROW1R .req d19 - ROW2L .req d20 - ROW2R .req d21 - ROW3L .req d22 - ROW3R .req d23 - ROW4L .req d24 - ROW4R .req d25 - ROW5L .req d26 - ROW5R .req d27 - ROW6L .req d28 - ROW6R .req d29 - ROW7L .req d30 - ROW7R .req d31 - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( q8 ) - * 1 | d18 | d19 ( q9 ) - * 2 | d20 | d21 ( q10 ) - * 3 | d22 | d23 ( q11 ) - * 4 | d24 | d25 ( q12 ) - * 
5 | d26 | d27 ( q13 ) - * 6 | d28 | d29 ( q14 ) - * 7 | d30 | d31 ( q15 ) - */ - adr ip, jsimd_idct_islow_neon_consts - vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! - vmul.s16 q8, q8, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q9, q9, q1 - vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! - vmul.s16 q10, q10, q2 - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vmul.s16 q11, q11, q3 - vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] - vmul.s16 q12, q12, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q14, q14, q2 - vmul.s16 q13, q13, q1 - vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ - add ip, ip, #16 - vmul.s16 q15, q15, q3 - vpush {d8-d15} /* save NEON registers */ - /* 1-D IDCT, pass 1, left 4x8 half */ - vadd.s16 d4, ROW7L, ROW3L - vadd.s16 d5, ROW5L, ROW1L - vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, d5, XFIX_1_175875602 - vmull.s16 q7, d4, XFIX_1_175875602 - /* Check for the zero coefficients in the right 4x8 half */ - push {r4, r5} - vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 - vsubl.s16 q3, ROW0L, ROW4L - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 - orr r0, r4, r5 - vmov q4, q6 - vmlsl.s16 q6, ROW5L, XFIX_2_562915447 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vshl.s32 q3, q3, #13 - orr r0, r0, r4 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - orr r0, r0, r5 - vadd.s32 q1, q3, q2 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] - vmov q5, q7 - vadd.s32 q1, q1, q6 - orr r0, r0, r4 - vmlsl.s16 q7, ROW7L, XFIX_0_899976223 - orr r0, r0, r5 - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vrshrn.s32 ROW1L, q1, #11 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5L, 
XFIX_2_053119869_MINUS_2_562915447 - orr r0, r0, r4 - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - orr r0, r0, r5 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] - vmlal.s16 q6, ROW6L, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - orr r0, r0, r4 - vrshrn.s32 ROW6L, q1, #11 - orr r0, r0, r5 - vadd.s32 q1, q3, q5 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0L, ROW4L - orr r0, r0, r4 - vrshrn.s32 ROW2L, q1, #11 - orr r0, r0, r5 - vrshrn.s32 ROW5L, q3, #11 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 - orr r0, r0, r4 - vadd.s32 q2, q5, q6 - orrs r0, r0, r5 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - orr r0, r4, r5 - vsub.s32 q3, q1, q4 - pop {r4, r5} - vrshrn.s32 ROW7L, q2, #11 - vrshrn.s32 ROW3L, q5, #11 - vrshrn.s32 ROW0L, q6, #11 - vrshrn.s32 ROW4L, q3, #11 - - beq 3f /* Go to do some special handling for the sparse right 4x8 half */ - - /* 1-D IDCT, pass 1, right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vadd.s16 d10, ROW7R, ROW3R - vadd.s16 d8, ROW5R, ROW1R - /* Transpose left 4x8 half */ - vtrn.16 ROW6L, ROW7L - vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, d8, XFIX_1_175875602 - vtrn.16 ROW2L, ROW3L - vmull.s16 q7, d10, XFIX_1_175875602 - vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 - vtrn.16 ROW0L, ROW1L - vsubl.s16 q3, ROW0R, ROW4R - vmull.s16 q2, ROW2R, XFIX_0_541196100 - vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 - vtrn.16 ROW4L, ROW5L - vmov q4, q6 - vmlsl.s16 q6, ROW5R, XFIX_2_562915447 - vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 - vtrn.32 ROW1L, ROW3L - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW1R, XFIX_0_899976223 - vtrn.32 ROW4L, ROW6L - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 
q1, q1, q6 - vtrn.32 ROW0L, ROW2L - vmlsl.s16 q7, ROW7R, XFIX_0_899976223 - vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 - vrshrn.s32 ROW1R, q1, #11 - vtrn.32 ROW5L, ROW7L - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 - vmlsl.s16 q5, ROW3R, XFIX_2_562915447 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 - vmlal.s16 q6, ROW6R, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - vrshrn.s32 ROW6R, q1, #11 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0R, ROW4R - vrshrn.s32 ROW2R, q1, #11 - vrshrn.s32 ROW5R, q3, #11 - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vrshrn.s32 ROW7R, q2, #11 - vrshrn.s32 ROW3R, q5, #11 - vrshrn.s32 ROW0R, q6, #11 - vrshrn.s32 ROW4R, q3, #11 - /* Transpose right 4x8 half */ - vtrn.16 ROW6R, ROW7R - vtrn.16 ROW2R, ROW3R - vtrn.16 ROW0R, ROW1R - vtrn.16 ROW4R, ROW5R - vtrn.32 ROW1R, ROW3R - vtrn.32 ROW4R, ROW6R - vtrn.32 ROW0R, ROW2R - vtrn.32 ROW5R, ROW7R - -1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW1L, XFIX_1_175875602 - vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ - vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW3L, XFIX_1_175875602 - vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ - vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 - vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ - vmov q4, q6 - vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, 
ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vshrn.s32 ROW1L, q1, #16 - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ - vsub.s32 q3, q3, q2 - vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ - vshrn.s32 ROW2L, q1, #16 - vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW3L, q5, #16 - vshrn.s32 ROW0L, q6, #16 - vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ - /* 1-D IDCT, pass 2, right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW5R, XFIX_1_175875602 - vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ - vmull.s16 q7, ROW7R, XFIX_1_175875602 - vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 - vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ - vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ - vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ - vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 - vmov q4, q6 - vmlsl.s16 q6, 
ROW5R, XFIX_2_562915447 - vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vmlsl.s16 q7, ROW7R, XFIX_0_899976223 - vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ - vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 - vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ - vmlal.s16 q6, ROW6R, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW6R, q1, #16 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ - vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ - vshrn.s32 ROW5R, q3, #16 - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW7R, q2, #16 - vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ - vshrn.s32 ROW4R, q3, #16 - -2: /* Descale to 8-bit and range limit */ - vqrshrn.s16 d16, q8, #2 - vqrshrn.s16 d17, q9, #2 - vqrshrn.s16 d18, q10, #2 - vqrshrn.s16 d19, q11, #2 - vpop {d8-d15} /* restore NEON registers */ - vqrshrn.s16 d20, q12, #2 - /* Transpose the final 8-bit samples and do signed->unsigned conversion */ - vtrn.16 q8, q9 - vqrshrn.s16 d21, q13, #2 - vqrshrn.s16 d22, q14, #2 - vmov.u8 q0, #(CENTERJSAMPLE) - vqrshrn.s16 d23, q15, #2 - vtrn.8 d16, d17 - vtrn.8 d18, d19 - vadd.u8 q8, q8, q0 - vadd.u8 q9, q9, q0 - vtrn.16 q10, q11 - /* Store results to the output buffer */ - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d16}, [TMP1] - vtrn.8 d20, d21 - vst1.8 
{d17}, [TMP2] - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d18}, [TMP1] - vadd.u8 q10, q10, q0 - vst1.8 {d19}, [TMP2] - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - vtrn.8 d22, d23 - vst1.8 {d20}, [TMP1] - vadd.u8 q11, q11, q0 - vst1.8 {d21}, [TMP2] - vst1.8 {d22}, [TMP3] - vst1.8 {d23}, [TMP4] - bx lr - -3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ - - /* Transpose left 4x8 half */ - vtrn.16 ROW6L, ROW7L - vtrn.16 ROW2L, ROW3L - vtrn.16 ROW0L, ROW1L - vtrn.16 ROW4L, ROW5L - vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ - vtrn.32 ROW1L, ROW3L - vtrn.32 ROW4L, ROW6L - vtrn.32 ROW0L, ROW2L - vtrn.32 ROW5L, ROW7L - - cmp r0, #0 - beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ - - /* Only row 0 is non-zero for the right 4x8 half */ - vdup.s16 ROW1R, ROW0R[1] - vdup.s16 ROW2R, ROW0R[2] - vdup.s16 ROW3R, ROW0R[3] - vdup.s16 ROW4R, ROW0R[0] - vdup.s16 ROW5R, ROW0R[1] - vdup.s16 ROW6R, ROW0R[2] - vdup.s16 ROW7R, ROW0R[3] - vdup.s16 ROW0R, ROW0R[0] - b 1b /* Go to 'normal' second pass */ - -4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW1L, XFIX_1_175875602 - vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW3L, XFIX_1_175875602 - vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vshll.s16 q3, ROW0L, #13 - vmov q4, q6 - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vadd.s32 q1, q1, q6 - vadd.s32 q6, q6, q6 - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - vshrn.s32 ROW1L, q1, #16 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 
- vsub.s32 q3, q3, q2 - vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vshll.s16 q5, ROW0L, #13 - vshrn.s32 ROW2L, q1, #16 - vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW3L, q5, #16 - vshrn.s32 ROW0L, q6, #16 - vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ - /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW5L, XFIX_1_175875602 - vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW7L, XFIX_1_175875602 - vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 - vmull.s16 q2, ROW6L, XFIX_0_541196100 - vshll.s16 q3, ROW4L, #13 - vmov q4, q6 - vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 - vmlsl.s16 q4, ROW5L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 - vadd.s32 q1, q1, q6 - vadd.s32 q6, q6, q6 - vmlsl.s16 q5, ROW7L, XFIX_2_562915447 - vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW6R, q1, #16 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vshll.s16 q5, ROW4L, #13 - vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ - vshrn.s32 ROW5R, q3, #16 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW7R, q2, #16 - vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ - vshrn.s32 ROW4R, q3, #16 - b 2b /* Go to epilogue */ - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - .unreq ROW0L - .unreq ROW0R - .unreq 
ROW1L - .unreq ROW1R - .unreq ROW2L - .unreq ROW2R - .unreq ROW3L - .unreq ROW3R - .unreq ROW4L - .unreq ROW4R - .unreq ROW5L - .unreq ROW5R - .unreq ROW6L - .unreq ROW6R - .unreq ROW7L - .unreq ROW7R -.endfunc - - -/*****************************************************************************/ - -/* - * jsimd_idct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the inverse DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' - * function from jidctfst.c - * - * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. - * But in ARM NEON case some extra additions are required because VQDMULH - * instruction can't handle the constants larger than 1. So the expressions - * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", - * which introduces an extra addition. Overall, there are 6 extra additions - * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. 
- */ - -#define XFIX_1_082392200 d0[0] -#define XFIX_1_414213562 d0[1] -#define XFIX_1_847759065 d0[2] -#define XFIX_2_613125930 d0[3] - -.balign 16 -jsimd_idct_ifast_neon_consts: - .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ - .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ - .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ - .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ - -asm_function jsimd_idct_ifast_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( q8 ) - * 1 | d18 | d19 ( q9 ) - * 2 | d20 | d21 ( q10 ) - * 3 | d22 | d23 ( q11 ) - * 4 | d24 | d25 ( q12 ) - * 5 | d26 | d27 ( q13 ) - * 6 | d28 | d29 ( q14 ) - * 7 | d30 | d31 ( q15 ) - */ - adr ip, jsimd_idct_ifast_neon_consts - vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! - vmul.s16 q8, q8, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q9, q9, q1 - vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! - vmul.s16 q10, q10, q2 - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vmul.s16 q11, q11, q3 - vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] - vmul.s16 q12, q12, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 
- vmul.s16 q14, q14, q2 - vmul.s16 q13, q13, q1 - vld1.16 {d0}, [ip, :64] /* load constants */ - vmul.s16 q15, q15, q3 - vpush {d8-d13} /* save NEON registers */ - /* 1-D IDCT, pass 1 */ - vsub.s16 q2, q10, q14 - vadd.s16 q14, q10, q14 - vsub.s16 q1, q11, q13 - vadd.s16 q13, q11, q13 - vsub.s16 q5, q9, q15 - vadd.s16 q15, q9, q15 - vqdmulh.s16 q4, q2, XFIX_1_414213562 - vqdmulh.s16 q6, q1, XFIX_2_613125930 - vadd.s16 q3, q1, q1 - vsub.s16 q1, q5, q1 - vadd.s16 q10, q2, q4 - vqdmulh.s16 q4, q1, XFIX_1_847759065 - vsub.s16 q2, q15, q13 - vadd.s16 q3, q3, q6 - vqdmulh.s16 q6, q2, XFIX_1_414213562 - vadd.s16 q1, q1, q4 - vqdmulh.s16 q4, q5, XFIX_1_082392200 - vsub.s16 q10, q10, q14 - vadd.s16 q2, q2, q6 - vsub.s16 q6, q8, q12 - vadd.s16 q12, q8, q12 - vadd.s16 q9, q5, q4 - vadd.s16 q5, q6, q10 - vsub.s16 q10, q6, q10 - vadd.s16 q6, q15, q13 - vadd.s16 q8, q12, q14 - vsub.s16 q3, q6, q3 - vsub.s16 q12, q12, q14 - vsub.s16 q3, q3, q1 - vsub.s16 q1, q9, q1 - vadd.s16 q2, q3, q2 - vsub.s16 q15, q8, q6 - vadd.s16 q1, q1, q2 - vadd.s16 q8, q8, q6 - vadd.s16 q14, q5, q3 - vsub.s16 q9, q5, q3 - vsub.s16 q13, q10, q2 - vadd.s16 q10, q10, q2 - /* Transpose */ - vtrn.16 q8, q9 - vsub.s16 q11, q12, q1 - vtrn.16 q14, q15 - vadd.s16 q12, q12, q1 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q8, q10 - vtrn.32 q13, q15 - vswp d28, d21 - vswp d26, d19 - /* 1-D IDCT, pass 2 */ - vsub.s16 q2, q10, q14 - vswp d30, d23 - vadd.s16 q14, q10, q14 - vswp d24, d17 - vsub.s16 q1, q11, q13 - vadd.s16 q13, q11, q13 - vsub.s16 q5, q9, q15 - vadd.s16 q15, q9, q15 - vqdmulh.s16 q4, q2, XFIX_1_414213562 - vqdmulh.s16 q6, q1, XFIX_2_613125930 - vadd.s16 q3, q1, q1 - vsub.s16 q1, q5, q1 - vadd.s16 q10, q2, q4 - vqdmulh.s16 q4, q1, XFIX_1_847759065 - vsub.s16 q2, q15, q13 - vadd.s16 q3, q3, q6 - vqdmulh.s16 q6, q2, XFIX_1_414213562 - vadd.s16 q1, q1, q4 - vqdmulh.s16 q4, q5, XFIX_1_082392200 - vsub.s16 q10, q10, q14 - vadd.s16 q2, q2, q6 - vsub.s16 q6, q8, q12 - 
vadd.s16 q12, q8, q12 - vadd.s16 q9, q5, q4 - vadd.s16 q5, q6, q10 - vsub.s16 q10, q6, q10 - vadd.s16 q6, q15, q13 - vadd.s16 q8, q12, q14 - vsub.s16 q3, q6, q3 - vsub.s16 q12, q12, q14 - vsub.s16 q3, q3, q1 - vsub.s16 q1, q9, q1 - vadd.s16 q2, q3, q2 - vsub.s16 q15, q8, q6 - vadd.s16 q1, q1, q2 - vadd.s16 q8, q8, q6 - vadd.s16 q14, q5, q3 - vsub.s16 q9, q5, q3 - vsub.s16 q13, q10, q2 - vpop {d8-d13} /* restore NEON registers */ - vadd.s16 q10, q10, q2 - vsub.s16 q11, q12, q1 - vadd.s16 q12, q12, q1 - /* Descale to 8-bit and range limit */ - vmov.u8 q0, #0x80 - vqshrn.s16 d16, q8, #5 - vqshrn.s16 d17, q9, #5 - vqshrn.s16 d18, q10, #5 - vqshrn.s16 d19, q11, #5 - vqshrn.s16 d20, q12, #5 - vqshrn.s16 d21, q13, #5 - vqshrn.s16 d22, q14, #5 - vqshrn.s16 d23, q15, #5 - vadd.u8 q8, q8, q0 - vadd.u8 q9, q9, q0 - vadd.u8 q10, q10, q0 - vadd.u8 q11, q11, q0 - /* Transpose the final 8-bit samples */ - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.8 d16, d17 - vtrn.8 d18, d19 - /* Store results to the output buffer */ - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d16}, [TMP1] - vst1.8 {d17}, [TMP2] - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d18}, [TMP1] - vtrn.8 d20, d21 - vst1.8 {d19}, [TMP2] - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - vst1.8 {d20}, [TMP1] - vtrn.8 d22, d23 - vst1.8 {d21}, [TMP2] - vst1.8 {d22}, [TMP3] - vst1.8 {d23}, [TMP4] - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 -.endfunc - - -/*****************************************************************************/ - -/* - * jsimd_idct_4x4_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 4x4 pixels output from an 8x8 DCT 
block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - * - * TODO: a bit better instructions scheduling can be achieved by expanding - * idct_helper/transpose_4x4 macros and reordering instructions, - * but readability will suffer somewhat. - */ - -#define CONST_BITS 13 - -#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ -#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ -#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ -#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ -#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ -#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ -#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ -#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ -#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ -#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ -#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ -#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ -#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ -#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ - -.balign 16 -jsimd_idct_4x4_neon_consts: - .short FIX_1_847759065 /* d0[0] */ - .short -FIX_0_765366865 /* d0[1] */ - .short -FIX_0_211164243 /* d0[2] */ - .short FIX_1_451774981 /* d0[3] */ - .short -FIX_2_172734803 /* d1[0] */ - .short FIX_1_061594337 /* d1[1] */ - .short -FIX_0_509795579 /* d1[2] */ - .short -FIX_0_601344887 /* d1[3] */ - .short FIX_0_899976223 /* d2[0] */ - .short FIX_2_562915447 /* d2[1] */ - .short 1 << (CONST_BITS+1) /* d2[2] */ - .short 0 /* d2[3] */ - -.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 - vmull.s16 q14, \x4, 
d2[2] - vmlal.s16 q14, \x8, d0[0] - vmlal.s16 q14, \x14, d0[1] - - vmull.s16 q13, \x16, d1[2] - vmlal.s16 q13, \x12, d1[3] - vmlal.s16 q13, \x10, d2[0] - vmlal.s16 q13, \x6, d2[1] - - vmull.s16 q15, \x4, d2[2] - vmlsl.s16 q15, \x8, d0[0] - vmlsl.s16 q15, \x14, d0[1] - - vmull.s16 q12, \x16, d0[2] - vmlal.s16 q12, \x12, d0[3] - vmlal.s16 q12, \x10, d1[0] - vmlal.s16 q12, \x6, d1[1] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - -.if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y29, q14 -.else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y29, q14, #\shift -.endif - - vadd.s32 q10, q15, q12 - vsub.s32 q15, q15, q12 - -.if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q15, q15, #\shift - vmovn.s32 \y27, q10 - vmovn.s32 \y28, q15 -.else - vrshrn.s32 \y27, q10, #\shift - vrshrn.s32 \y28, q15, #\shift -.endif - -.endm - -asm_function jsimd_idct_4x4_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - vpush {d8-d15} - - /* Load constants (d3 is just used for padding) */ - adr TMP4, jsimd_idct_4x4_neon_consts - vld1.16 {d0, d1, d2, d3}, [TMP4, :128] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | d8 | d9 - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | d14 | d15 - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! - vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! - vmul.s16 q2, q2, q9 - vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q3, q3, q10 - vmul.s16 q4, q4, q11 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! 
- vmul.s16 q5, q5, q12 - vmul.s16 q6, q6, q13 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! - vmul.s16 q7, q7, q14 - vmul.s16 q8, q8, q15 - - /* Pass 1 */ - idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 - transpose_4x4 d5, d7, d9, d11 - - /* Pass 2 */ - idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 - transpose_4x4 d26, d27, d28, d29 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vadd.s16 q14, q14, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q14 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - -#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT - /* We can use much less instructions on little endian systems if the - * OS kernel is not configured to trap unaligned memory accesses - */ - vst1.32 {d26[0]}, [TMP1]! - vst1.32 {d27[0]}, [TMP3]! - vst1.32 {d26[1]}, [TMP2]! - vst1.32 {d27[1]}, [TMP4]! -#else - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[0]}, [TMP3]! - vst1.8 {d26[1]}, [TMP1]! - vst1.8 {d27[1]}, [TMP3]! - vst1.8 {d26[2]}, [TMP1]! - vst1.8 {d27[2]}, [TMP3]! - vst1.8 {d26[3]}, [TMP1]! - vst1.8 {d27[3]}, [TMP3]! - - vst1.8 {d26[4]}, [TMP2]! - vst1.8 {d27[4]}, [TMP4]! - vst1.8 {d26[5]}, [TMP2]! - vst1.8 {d27[5]}, [TMP4]! - vst1.8 {d26[6]}, [TMP2]! - vst1.8 {d27[6]}, [TMP4]! - vst1.8 {d26[7]}, [TMP2]! - vst1.8 {d27[7]}, [TMP4]! -#endif - - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 -.endfunc - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_idct_2x2_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 2x2 pixels output from an 8x8 DCT block. 
It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - */ - -.balign 8 -jsimd_idct_2x2_neon_consts: - .short -FIX_0_720959822 /* d0[0] */ - .short FIX_0_850430095 /* d0[1] */ - .short -FIX_1_272758580 /* d0[2] */ - .short FIX_3_624509785 /* d0[3] */ - -.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - vshll.s16 q14, \x4, #15 - vmull.s16 q13, \x6, d0[3] - vmlal.s16 q13, \x10, d0[2] - vmlal.s16 q13, \x12, d0[1] - vmlal.s16 q13, \x16, d0[0] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - -.if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y27, q14 -.else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y27, q14, #\shift -.endif - -.endm - -asm_function jsimd_idct_2x2_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP2, jsimd_idct_2x2_neon_consts - vld1.16 {d0}, [TMP2, :64] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | - | - - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | - | - - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* Dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! 
- vmul.s16 q2, q2, q9 - vmul.s16 q3, q3, q10 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q5, q5, q12 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27}, [DCT_TABLE, :128]! - vmul.s16 q6, q6, q13 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! - vmul.s16 q8, q8, q15 - - /* Pass 1 */ -#if 0 - idct_helper d4, d6, d10, d12, d16, 13, d4, d6 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d11, d13, d17, 13, d5, d7 - transpose_4x4 d5, d7, d9, d11 -#else - vmull.s16 q13, d6, d0[3] - vmlal.s16 q13, d10, d0[2] - vmlal.s16 q13, d12, d0[1] - vmlal.s16 q13, d16, d0[0] - vmull.s16 q12, d7, d0[3] - vmlal.s16 q12, d11, d0[2] - vmlal.s16 q12, d13, d0[1] - vmlal.s16 q12, d17, d0[0] - vshll.s16 q14, d4, #15 - vshll.s16 q15, d5, #15 - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - vrshrn.s32 d4, q10, #13 - vrshrn.s32 d6, q14, #13 - vadd.s32 q10, q15, q12 - vsub.s32 q14, q15, q12 - vrshrn.s32 d5, q10, #13 - vrshrn.s32 d7, q14, #13 - vtrn.16 q2, q3 - vtrn.32 q3, q5 -#endif - - /* Pass 2 */ - idct_helper d4, d6, d10, d7, d11, 20, d26, d27 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q13 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[4]}, [TMP1]! - vst1.8 {d26[1]}, [TMP2]! - vst1.8 {d27[5]}, [TMP2]! 
- - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 -.endfunc - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_ycc_extrgb_convert_neon - * jsimd_ycc_extbgr_convert_neon - * jsimd_ycc_extrgbx_convert_neon - * jsimd_ycc_extbgrx_convert_neon - * jsimd_ycc_extxbgr_convert_neon - * jsimd_ycc_extxrgb_convert_neon - * - * Colorspace conversion YCbCr -> RGB - */ - - -.macro do_load size - .if \size == 8 - vld1.8 {d4}, [U, :64]! - vld1.8 {d5}, [V, :64]! - vld1.8 {d0}, [Y, :64]! - pld [U, #64] - pld [V, #64] - pld [Y, #64] - .elseif \size == 4 - vld1.8 {d4[0]}, [U]! - vld1.8 {d4[1]}, [U]! - vld1.8 {d4[2]}, [U]! - vld1.8 {d4[3]}, [U]! - vld1.8 {d5[0]}, [V]! - vld1.8 {d5[1]}, [V]! - vld1.8 {d5[2]}, [V]! - vld1.8 {d5[3]}, [V]! - vld1.8 {d0[0]}, [Y]! - vld1.8 {d0[1]}, [Y]! - vld1.8 {d0[2]}, [Y]! - vld1.8 {d0[3]}, [Y]! - .elseif \size == 2 - vld1.8 {d4[4]}, [U]! - vld1.8 {d4[5]}, [U]! - vld1.8 {d5[4]}, [V]! - vld1.8 {d5[5]}, [V]! - vld1.8 {d0[4]}, [Y]! - vld1.8 {d0[5]}, [Y]! - .elseif \size == 1 - vld1.8 {d4[6]}, [U]! - vld1.8 {d5[6]}, [V]! - vld1.8 {d0[6]}, [Y]! - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_store bpp, size - .if \bpp == 24 - .if \size == 8 - vst3.8 {d10, d11, d12}, [RGB]! - .elseif \size == 4 - vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vst4.8 {d10, d11, d12, d13}, [RGB]! - .elseif \size == 4 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! 
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs - -/* - * 2 stage pipelined YCbCr->RGB conversion - */ - -.macro do_yuv_to_rgb_stage1 - vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ - vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ - vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ -.endm - -.macro do_yuv_to_rgb_stage2 - vrshrn.s32 d20, q10, #15 - vrshrn.s32 d21, q11, #15 - vrshrn.s32 d24, q12, #14 - vrshrn.s32 d25, q13, #14 - vrshrn.s32 d28, q14, #14 - vrshrn.s32 d29, q15, #14 - vaddw.u8 q10, q10, d0 - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - vqmovun.s16 d1\g_offs, q10 - vqmovun.s16 d1\r_offs, q12 - vqmovun.s16 d1\b_offs, q14 -.endm - -.macro do_yuv_to_rgb_stage2_store_load_stage1 - vld1.8 {d4}, [U, :64]! - vrshrn.s32 d20, q10, #15 - vrshrn.s32 d21, q11, #15 - vrshrn.s32 d24, q12, #14 - vrshrn.s32 d25, q13, #14 - vrshrn.s32 d28, q14, #14 - vld1.8 {d5}, [V, :64]! - vrshrn.s32 d29, q15, #14 - vaddw.u8 q10, q10, d0 - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - vqmovun.s16 d1\g_offs, q10 - vld1.8 {d0}, [Y, :64]! 
- vqmovun.s16 d1\r_offs, q12 - pld [U, #64] - pld [V, #64] - pld [Y, #64] - vqmovun.s16 d1\b_offs, q14 - vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ - vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - do_store \bpp, 8 - vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ - vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ -.endm - -.macro do_yuv_to_rgb - do_yuv_to_rgb_stage1 - do_yuv_to_rgb_stage2 -.endm - -/* Apple gas crashes on adrl, work around that by using adr. - * But this requires a copy of these constants for each function. - */ - -.balign 16 -jsimd_ycc_\colorid\()_neon_consts: - .short 0, 0, 0, 0 - .short 22971, -11277, -23401, 29033 - .short -128, -128, -128, -128 - .short -128, -128, -128, -128 - -asm_function jsimd_ycc_\colorid\()_convert_neon - OUTPUT_WIDTH .req r0 - INPUT_BUF .req r1 - INPUT_ROW .req r2 - OUTPUT_BUF .req r3 - NUM_ROWS .req r4 - - INPUT_BUF0 .req r5 - INPUT_BUF1 .req r6 - INPUT_BUF2 .req INPUT_BUF - - RGB .req r7 - Y .req r8 - U .req r9 - V .req r10 - N .req ip - - /* Load constants to d1, d2, d3 (d0 is just used for padding) */ - adr ip, jsimd_ycc_\colorid\()_neon_consts - vld1.16 {d0, d1, d2, d3}, [ip, :128] - - /* Save ARM registers and handle input arguments */ - push {r4, r5, r6, r7, r8, r9, r10, lr} - ldr NUM_ROWS, [sp, #(4 * 8)] - ldr INPUT_BUF0, [INPUT_BUF] - ldr INPUT_BUF1, [INPUT_BUF, #4] - ldr INPUT_BUF2, [INPUT_BUF, #8] - .unreq INPUT_BUF - - /* Save NEON registers */ - vpush {d8-d15} - - /* Initially set d10, d11, d12, d13 to 0xFF */ - vmov.u8 q5, #255 - vmov.u8 q6, #255 - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - blt 9f -0: - ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] - ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] - mov N, 
OUTPUT_WIDTH - ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] - add INPUT_ROW, INPUT_ROW, #1 - ldr RGB, [OUTPUT_BUF], #4 - - /* Inner loop over pixels */ - subs N, N, #8 - blt 3f - do_load 8 - do_yuv_to_rgb_stage1 - subs N, N, #8 - blt 2f -1: - do_yuv_to_rgb_stage2_store_load_stage1 - subs N, N, #8 - bge 1b -2: - do_yuv_to_rgb_stage2 - do_store \bpp, 8 - tst N, #7 - beq 8f -3: - tst N, #4 - beq 3f - do_load 4 -3: - tst N, #2 - beq 4f - do_load 2 -4: - tst N, #1 - beq 5f - do_load 1 -5: - do_yuv_to_rgb - tst N, #4 - beq 6f - do_store \bpp, 4 -6: - tst N, #2 - beq 7f - do_store \bpp, 2 -7: - tst N, #1 - beq 8f - do_store \bpp, 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - bgt 0b -9: - /* Restore all registers and return */ - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, r10, pc} - - .unreq OUTPUT_WIDTH - .unreq INPUT_ROW - .unreq OUTPUT_BUF - .unreq NUM_ROWS - .unreq INPUT_BUF0 - .unreq INPUT_BUF1 - .unreq INPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N -.endfunc - -.purgem do_yuv_to_rgb -.purgem do_yuv_to_rgb_stage1 -.purgem do_yuv_to_rgb_stage2 -.purgem do_yuv_to_rgb_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B */ -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * jsimd_extrgb_ycc_convert_neon - * jsimd_extbgr_ycc_convert_neon - * jsimd_extrgbx_ycc_convert_neon - * jsimd_extbgrx_ycc_convert_neon - * jsimd_extxbgr_ycc_convert_neon - * jsimd_extxrgb_ycc_convert_neon - * - * Colorspace conversion RGB -> YCbCr - */ - -.macro do_store size - .if \size == 8 - vst1.8 {d20}, [Y]! - vst1.8 {d21}, [U]! 
- vst1.8 {d22}, [V]! - .elseif \size == 4 - vst1.8 {d20[0]}, [Y]! - vst1.8 {d20[1]}, [Y]! - vst1.8 {d20[2]}, [Y]! - vst1.8 {d20[3]}, [Y]! - vst1.8 {d21[0]}, [U]! - vst1.8 {d21[1]}, [U]! - vst1.8 {d21[2]}, [U]! - vst1.8 {d21[3]}, [U]! - vst1.8 {d22[0]}, [V]! - vst1.8 {d22[1]}, [V]! - vst1.8 {d22[2]}, [V]! - vst1.8 {d22[3]}, [V]! - .elseif \size == 2 - vst1.8 {d20[4]}, [Y]! - vst1.8 {d20[5]}, [Y]! - vst1.8 {d21[4]}, [U]! - vst1.8 {d21[5]}, [U]! - vst1.8 {d22[4]}, [V]! - vst1.8 {d22[5]}, [V]! - .elseif \size == 1 - vst1.8 {d20[6]}, [Y]! - vst1.8 {d21[6]}, [U]! - vst1.8 {d22[6]}, [V]! - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_load bpp, size - .if \bpp == 24 - .if \size == 8 - vld3.8 {d10, d11, d12}, [RGB]! - pld [RGB, #128] - .elseif \size == 4 - vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vld4.8 {d10, d11, d12, d13}, [RGB]! - pld [RGB, #128] - .elseif \size == 4 - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! 
- .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs - -/* - * 2 stage pipelined RGB->YCbCr conversion - */ - -.macro do_rgb_to_yuv_stage1 - vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ - vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ - vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ - vmull.u16 q7, d4, d0[0] - vmlal.u16 q7, d6, d0[1] - vmlal.u16 q7, d8, d0[2] - vmull.u16 q8, d5, d0[0] - vmlal.u16 q8, d7, d0[1] - vmlal.u16 q8, d9, d0[2] - vrev64.32 q9, q1 - vrev64.32 q13, q1 - vmlsl.u16 q9, d4, d0[3] - vmlsl.u16 q9, d6, d1[0] - vmlal.u16 q9, d8, d1[1] - vmlsl.u16 q13, d5, d0[3] - vmlsl.u16 q13, d7, d1[0] - vmlal.u16 q13, d9, d1[1] - vrev64.32 q14, q1 - vrev64.32 q15, q1 - vmlal.u16 q14, d4, d1[1] - vmlsl.u16 q14, d6, d1[2] - vmlsl.u16 q14, d8, d1[3] - vmlal.u16 q15, d5, d1[1] - vmlsl.u16 q15, d7, d1[2] - vmlsl.u16 q15, d9, d1[3] -.endm - -.macro do_rgb_to_yuv_stage2 - vrshrn.u32 d20, q7, #16 - vrshrn.u32 d21, q8, #16 - vshrn.u32 d22, q9, #16 - vshrn.u32 d23, q13, #16 - vshrn.u32 d24, q14, #16 - vshrn.u32 d25, q15, #16 - vmovn.u16 d20, q10 /* d20 = y */ - vmovn.u16 d21, q11 /* d21 = u */ - vmovn.u16 d22, q12 /* d22 = v */ -.endm - -.macro do_rgb_to_yuv - do_rgb_to_yuv_stage1 - do_rgb_to_yuv_stage2 -.endm - -.macro do_rgb_to_yuv_stage2_store_load_stage1 - vrshrn.u32 d20, q7, #16 - vrshrn.u32 d21, q8, #16 - vshrn.u32 d22, q9, #16 - vrev64.32 q9, q1 - vshrn.u32 d23, q13, #16 - vrev64.32 q13, q1 - vshrn.u32 d24, q14, #16 - vshrn.u32 d25, q15, #16 - do_load \bpp, 8 - vmovn.u16 d20, q10 /* d20 = y */ - vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ - vmovn.u16 d21, q11 /* d21 = u */ - vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ - vmovn.u16 d22, q12 /* d22 = v */ - vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ - vmull.u16 q7, d4, d0[0] - vmlal.u16 q7, d6, d0[1] - vmlal.u16 q7, d8, d0[2] - vst1.8 {d20}, [Y]! 
- vmull.u16 q8, d5, d0[0] - vmlal.u16 q8, d7, d0[1] - vmlal.u16 q8, d9, d0[2] - vmlsl.u16 q9, d4, d0[3] - vmlsl.u16 q9, d6, d1[0] - vmlal.u16 q9, d8, d1[1] - vst1.8 {d21}, [U]! - vmlsl.u16 q13, d5, d0[3] - vmlsl.u16 q13, d7, d1[0] - vmlal.u16 q13, d9, d1[1] - vrev64.32 q14, q1 - vrev64.32 q15, q1 - vmlal.u16 q14, d4, d1[1] - vmlsl.u16 q14, d6, d1[2] - vmlsl.u16 q14, d8, d1[3] - vst1.8 {d22}, [V]! - vmlal.u16 q15, d5, d1[1] - vmlsl.u16 q15, d7, d1[2] - vmlsl.u16 q15, d9, d1[3] -.endm - -.balign 16 -jsimd_\colorid\()_ycc_neon_consts: - .short 19595, 38470, 7471, 11059 - .short 21709, 32768, 27439, 5329 - .short 32767, 128, 32767, 128 - .short 32767, 128, 32767, 128 - -asm_function jsimd_\colorid\()_ycc_convert_neon - OUTPUT_WIDTH .req r0 - INPUT_BUF .req r1 - OUTPUT_BUF .req r2 - OUTPUT_ROW .req r3 - NUM_ROWS .req r4 - - OUTPUT_BUF0 .req r5 - OUTPUT_BUF1 .req r6 - OUTPUT_BUF2 .req OUTPUT_BUF - - RGB .req r7 - Y .req r8 - U .req r9 - V .req r10 - N .req ip - - /* Load constants to d0, d1, d2, d3 */ - adr ip, jsimd_\colorid\()_ycc_neon_consts - vld1.16 {d0, d1, d2, d3}, [ip, :128] - - /* Save ARM registers and handle input arguments */ - push {r4, r5, r6, r7, r8, r9, r10, lr} - ldr NUM_ROWS, [sp, #(4 * 8)] - ldr OUTPUT_BUF0, [OUTPUT_BUF] - ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] - ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] - .unreq OUTPUT_BUF - - /* Save NEON registers */ - vpush {d8-d15} - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - blt 9f -0: - ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] - ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] - mov N, OUTPUT_WIDTH - ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] - add OUTPUT_ROW, OUTPUT_ROW, #1 - ldr RGB, [INPUT_BUF], #4 - - /* Inner loop over pixels */ - subs N, N, #8 - blt 3f - do_load \bpp, 8 - do_rgb_to_yuv_stage1 - subs N, N, #8 - blt 2f -1: - do_rgb_to_yuv_stage2_store_load_stage1 - subs N, N, #8 - bge 1b -2: - do_rgb_to_yuv_stage2 - do_store 8 - tst N, #7 - beq 8f -3: - tst N, #4 - beq 3f - do_load \bpp, 4 -3: - tst N, #2 - beq 4f - 
do_load \bpp, 2 -4: - tst N, #1 - beq 5f - do_load \bpp, 1 -5: - do_rgb_to_yuv - tst N, #4 - beq 6f - do_store 4 -6: - tst N, #2 - beq 7f - do_store 2 -7: - tst N, #1 - beq 8f - do_store 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - bgt 0b -9: - /* Restore all registers and return */ - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, r10, pc} - - .unreq OUTPUT_WIDTH - .unreq OUTPUT_ROW - .unreq INPUT_BUF - .unreq NUM_ROWS - .unreq OUTPUT_BUF0 - .unreq OUTPUT_BUF1 - .unreq OUTPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N -.endfunc - -.purgem do_rgb_to_yuv -.purgem do_rgb_to_yuv_stage1 -.purgem do_rgb_to_yuv_stage2 -.purgem do_rgb_to_yuv_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B */ -generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * Load data into workspace, applying unsigned->signed conversion - * - * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get - * rid of VST1.16 instructions - */ - -asm_function jsimd_convsamp_neon - SAMPLE_DATA .req r0 - START_COL .req r1 - WORKSPACE .req r2 - TMP1 .req r3 - TMP2 .req r4 - TMP3 .req r5 - TMP4 .req ip - - push {r4, r5} - vmov.u8 d0, #128 - - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d16}, [TMP1] - vsubl.u8 q8, d16, d0 - vld1.8 {d18}, [TMP2] - vsubl.u8 q9, d18, d0 - vld1.8 {d20}, [TMP3] - vsubl.u8 q10, d20, d0 - vld1.8 {d22}, [TMP4] - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - vsubl.u8 q11, d22, d0 - vst1.16 {d16, 
d17, d18, d19}, [WORKSPACE, :128]! - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d24}, [TMP1] - vsubl.u8 q12, d24, d0 - vld1.8 {d26}, [TMP2] - vsubl.u8 q13, d26, d0 - vld1.8 {d28}, [TMP3] - vsubl.u8 q14, d28, d0 - vld1.8 {d30}, [TMP4] - vsubl.u8 q15, d30, d0 - vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! - vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! - pop {r4, r5} - bx lr - - .unreq SAMPLE_DATA - .unreq START_COL - .unreq WORKSPACE - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 -.endfunc - - -/*****************************************************************************/ - -/* - * jsimd_fdct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the forward DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' - * function from jfdctfst.c - * - * TODO: can be combined with 'jsimd_convsamp_neon' to get - * rid of a bunch of VLD1.16 instructions - */ - -#define XFIX_0_382683433 d0[0] -#define XFIX_0_541196100 d0[1] -#define XFIX_0_707106781 d0[2] -#define XFIX_1_306562965 d0[3] - -.balign 16 -jsimd_fdct_ifast_neon_consts: - .short (98 * 128) /* XFIX_0_382683433 */ - .short (139 * 128) /* XFIX_0_541196100 */ - .short (181 * 128) /* XFIX_0_707106781 */ - .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ - -asm_function jsimd_fdct_ifast_neon - - DATA .req r0 - TMP .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP, jsimd_fdct_ifast_neon_consts - vld1.16 {d0}, [TMP, :64] - - /* Load all DATA into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 | q8 - * 1 | d18 | d19 | q9 - * 2 | d20 | d21 | q10 - * 3 | d22 | d23 | q11 - * 4 | d24 | d25 | q12 - * 5 | d26 | d27 | q13 - * 6 | d28 | d29 | q14 - * 7 | d30 | d31 | q15 - */ - - vld1.16 
{d16, d17, d18, d19}, [DATA, :128]! - vld1.16 {d20, d21, d22, d23}, [DATA, :128]! - vld1.16 {d24, d25, d26, d27}, [DATA, :128]! - vld1.16 {d28, d29, d30, d31}, [DATA, :128] - sub DATA, DATA, #(128 - 32) - - mov TMP, #2 -1: - /* Transpose */ - vtrn.16 q12, q13 - vtrn.16 q10, q11 - vtrn.16 q8, q9 - vtrn.16 q14, q15 - vtrn.32 q9, q11 - vtrn.32 q13, q15 - vtrn.32 q8, q10 - vtrn.32 q12, q14 - vswp d30, d23 - vswp d24, d17 - vswp d26, d19 - /* 1-D FDCT */ - vadd.s16 q2, q11, q12 - vswp d28, d21 - vsub.s16 q12, q11, q12 - vsub.s16 q6, q10, q13 - vadd.s16 q10, q10, q13 - vsub.s16 q7, q9, q14 - vadd.s16 q9, q9, q14 - vsub.s16 q1, q8, q15 - vadd.s16 q8, q8, q15 - vsub.s16 q4, q9, q10 - vsub.s16 q5, q8, q2 - vadd.s16 q3, q9, q10 - vadd.s16 q4, q4, q5 - vadd.s16 q2, q8, q2 - vqdmulh.s16 q4, q4, XFIX_0_707106781 - vadd.s16 q11, q12, q6 - vadd.s16 q8, q2, q3 - vsub.s16 q12, q2, q3 - vadd.s16 q3, q6, q7 - vadd.s16 q7, q7, q1 - vqdmulh.s16 q3, q3, XFIX_0_707106781 - vsub.s16 q6, q11, q7 - vadd.s16 q10, q5, q4 - vqdmulh.s16 q6, q6, XFIX_0_382683433 - vsub.s16 q14, q5, q4 - vqdmulh.s16 q11, q11, XFIX_0_541196100 - vqdmulh.s16 q5, q7, XFIX_1_306562965 - vadd.s16 q4, q1, q3 - vsub.s16 q3, q1, q3 - vadd.s16 q7, q7, q6 - vadd.s16 q11, q11, q6 - vadd.s16 q7, q7, q5 - vadd.s16 q13, q3, q11 - vsub.s16 q11, q3, q11 - vadd.s16 q9, q4, q7 - vsub.s16 q15, q4, q7 - subs TMP, TMP, #1 - bne 1b - - /* store results */ - vst1.16 {d16, d17, d18, d19}, [DATA, :128]! - vst1.16 {d20, d21, d22, d23}, [DATA, :128]! - vst1.16 {d24, d25, d26, d27}, [DATA, :128]! 
- vst1.16 {d28, d29, d30, d31}, [DATA, :128] - - vpop {d8-d15} - bx lr - - .unreq DATA - .unreq TMP -.endfunc - - -/*****************************************************************************/ - -/* - * GLOBAL(void) - * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, - * DCTELEM * workspace); - * - * Note: the code uses 2 stage pipelining in order to improve instructions - * scheduling and eliminate stalls (this provides ~15% better - * performance for this function on both ARM Cortex-A8 and - * ARM Cortex-A9 when compared to the non-pipelined variant). - * The instructions which belong to the second stage use different - * indentation for better readiability. - */ -asm_function jsimd_quantize_neon - - COEF_BLOCK .req r0 - DIVISORS .req r1 - WORKSPACE .req r2 - - RECIPROCAL .req DIVISORS - CORRECTION .req r3 - SHIFT .req ip - LOOP_COUNT .req r4 - - vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! - vabs.s16 q12, q0 - add CORRECTION, DIVISORS, #(64 * 2) - add SHIFT, DIVISORS, #(64 * 6) - vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! - vabs.s16 q13, q1 - vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! - vadd.u16 q12, q12, q10 /* add correction */ - vadd.u16 q13, q13, q11 - vmull.u16 q10, d24, d16 /* multiply by reciprocal */ - vmull.u16 q11, d25, d17 - vmull.u16 q8, d26, d18 - vmull.u16 q9, d27, d19 - vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! - vshrn.u32 d20, q10, #16 - vshrn.u32 d21, q11, #16 - vshrn.u32 d22, q8, #16 - vshrn.u32 d23, q9, #16 - vneg.s16 q12, q12 - vneg.s16 q13, q13 - vshr.s16 q2, q0, #15 /* extract sign */ - vshr.s16 q3, q1, #15 - vshl.u16 q14, q10, q12 /* shift */ - vshl.u16 q15, q11, q13 - - push {r4, r5} - mov LOOP_COUNT, #3 -1: - vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! - veor.u16 q14, q14, q2 /* restore sign */ - vabs.s16 q12, q0 - vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! - vabs.s16 q13, q1 - veor.u16 q15, q15, q3 - vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! 
- vadd.u16 q12, q12, q10 /* add correction */ - vadd.u16 q13, q13, q11 - vmull.u16 q10, d24, d16 /* multiply by reciprocal */ - vmull.u16 q11, d25, d17 - vmull.u16 q8, d26, d18 - vmull.u16 q9, d27, d19 - vsub.u16 q14, q14, q2 - vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! - vsub.u16 q15, q15, q3 - vshrn.u32 d20, q10, #16 - vshrn.u32 d21, q11, #16 - vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! - vshrn.u32 d22, q8, #16 - vshrn.u32 d23, q9, #16 - vneg.s16 q12, q12 - vneg.s16 q13, q13 - vshr.s16 q2, q0, #15 /* extract sign */ - vshr.s16 q3, q1, #15 - vshl.u16 q14, q10, q12 /* shift */ - vshl.u16 q15, q11, q13 - subs LOOP_COUNT, LOOP_COUNT, #1 - bne 1b - pop {r4, r5} - - veor.u16 q14, q14, q2 /* restore sign */ - veor.u16 q15, q15, q3 - vsub.u16 q14, q14, q2 - vsub.u16 q15, q15, q3 - vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! - - bx lr /* return */ - - .unreq COEF_BLOCK - .unreq DIVISORS - .unreq WORKSPACE - .unreq RECIPROCAL - .unreq CORRECTION - .unreq SHIFT - .unreq LOOP_COUNT -.endfunc - - -/*****************************************************************************/ - -/* - * GLOBAL(void) - * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, - * JDIMENSION downsampled_width, - * JSAMPARRAY input_data, - * JSAMPARRAY * output_data_ptr); - * - * Note: the use of unaligned writes is the main remaining bottleneck in - * this code, which can be potentially solved to get up to tens - * of percents performance improvement on Cortex-A8/Cortex-A9. - */ - -/* - * Upsample 16 source pixels to 32 destination pixels. The new 16 source - * pixels are loaded to q0. The previous 16 source pixels are in q1. The - * shifted-by-one source pixels are constructed in q2 by using q0 and q1. - * Register d28 is used for multiplication by 3. Register q15 is used - * for adding +1 bias. - */ -.macro upsample16 OUTPTR, INPTR - vld1.8 {q0}, [\INPTR]! 
- vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - vmov q1, q0 /* backup source pixels to q1 */ - vrshrn.u16 d6, q8, #2 - vrshrn.u16 d7, q9, #2 - vshrn.u16 d8, q10, #2 - vshrn.u16 d9, q11, #2 - vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! -.endm - -/* - * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16' - * macro, the roles of q0 and q1 registers are reversed for even and odd - * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed. - * Also this unrolling allows to reorder loads and stores to compensate - * multiplication latency and reduce stalls. - */ -.macro upsample32 OUTPTR, INPTR - /* even 16 pixels group */ - vld1.8 {q0}, [\INPTR]! - vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - /* odd 16 pixels group */ - vld1.8 {q1}, [\INPTR]! - vrshrn.u16 d6, q8, #2 - vrshrn.u16 d7, q9, #2 - vshrn.u16 d8, q10, #2 - vshrn.u16 d9, q11, #2 - vmovl.u8 q8, d2 - vext.8 q2, q0, q1, #15 - vmovl.u8 q9, d3 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d2, d28 - vmlal.u8 q11, d3, d28 - vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! - vrshrn.u16 d6, q8, #2 - vrshrn.u16 d7, q9, #2 - vshrn.u16 d8, q10, #2 - vshrn.u16 d9, q11, #2 - vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! -.endm - -/* - * Upsample a row of WIDTH pixels from INPTR to OUTPTR. 
- */ -.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 - /* special case for the first and last pixels */ - sub \WIDTH, \WIDTH, #1 - add \OUTPTR, \OUTPTR, #1 - ldrb \TMP1, [\INPTR, \WIDTH] - strb \TMP1, [\OUTPTR, \WIDTH, asl #1] - ldrb \TMP1, [\INPTR], #1 - strb \TMP1, [\OUTPTR, #-1] - vmov.8 d3[7], \TMP1 - - subs \WIDTH, \WIDTH, #32 - blt 5f -0: /* process 32 pixels per iteration */ - upsample32 \OUTPTR, \INPTR - subs \WIDTH, \WIDTH, #32 - bge 0b -5: - adds \WIDTH, \WIDTH, #16 - blt 1f -0: /* process 16 pixels if needed */ - upsample16 \OUTPTR, \INPTR - subs \WIDTH, \WIDTH, #16 -1: - adds \WIDTH, \WIDTH, #16 - beq 9f - - /* load the remaining 1-15 pixels */ - add \INPTR, \INPTR, \WIDTH - tst \WIDTH, #1 - beq 2f - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #2 - beq 2f - vext.8 d0, d0, d0, #6 - sub \INPTR, \INPTR, #1 - vld1.8 {d0[1]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #4 - beq 2f - vrev64.32 d0, d0 - sub \INPTR, \INPTR, #1 - vld1.8 {d0[3]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[2]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[1]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #8 - beq 2f - vmov d1, d0 - sub \INPTR, \INPTR, #8 - vld1.8 {d0}, [\INPTR] -2: /* upsample the remaining pixels */ - vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - vrshrn.u16 d10, q8, #2 - vrshrn.u16 d12, q9, #2 - vshrn.u16 d11, q10, #2 - vshrn.u16 d13, q11, #2 - vzip.8 d10, d11 - vzip.8 d12, d13 - /* store the remaining pixels */ - tst \WIDTH, #8 - beq 2f - vst1.8 {d10, d11}, [\OUTPTR]! - vmov q5, q6 -2: - tst \WIDTH, #4 - beq 2f - vst1.8 {d10}, [\OUTPTR]! - vmov d10, d11 -2: - tst \WIDTH, #2 - beq 2f - vst1.8 {d10[0]}, [\OUTPTR]! - vst1.8 {d10[1]}, [\OUTPTR]! - vst1.8 {d10[2]}, [\OUTPTR]! - vst1.8 {d10[3]}, [\OUTPTR]! 
- vext.8 d10, d10, d10, #4 -2: - tst \WIDTH, #1 - beq 2f - vst1.8 {d10[0]}, [\OUTPTR]! - vst1.8 {d10[1]}, [\OUTPTR]! -2: -9: -.endm - -asm_function jsimd_h2v1_fancy_upsample_neon - - MAX_V_SAMP_FACTOR .req r0 - DOWNSAMPLED_WIDTH .req r1 - INPUT_DATA .req r2 - OUTPUT_DATA_PTR .req r3 - OUTPUT_DATA .req OUTPUT_DATA_PTR - - OUTPTR .req r4 - INPTR .req r5 - WIDTH .req ip - TMP .req lr - - push {r4, r5, r6, lr} - vpush {d8-d15} - - ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] - cmp MAX_V_SAMP_FACTOR, #0 - ble 99f - - /* initialize constants */ - vmov.u8 d28, #3 - vmov.u16 q15, #1 -11: - ldr INPTR, [INPUT_DATA], #4 - ldr OUTPTR, [OUTPUT_DATA], #4 - mov WIDTH, DOWNSAMPLED_WIDTH - upsample_row OUTPTR, INPTR, WIDTH, TMP - subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 - bgt 11b - -99: - vpop {d8-d15} - pop {r4, r5, r6, pc} - - .unreq MAX_V_SAMP_FACTOR - .unreq DOWNSAMPLED_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA_PTR - .unreq OUTPUT_DATA - - .unreq OUTPTR - .unreq INPTR - .unreq WIDTH - .unreq TMP - -.endfunc - -.purgem upsample16 -.purgem upsample32 -.purgem upsample_row diff --git a/Builder/jni-1.11/simd/h/jdct.h b/Builder/jni-1.11/simd/h/jdct.h new file mode 100644 index 000000000..faf8e1cf0 --- /dev/null +++ b/Builder/jni-1.11/simd/h/jdct.h @@ -0,0 +1,208 @@ +/* + * jdct.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This include file contains common declarations for the forward and + * inverse DCT modules. These declarations are private to the DCT managers + * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms. + * The individual DCT algorithms are kept in separate files to ease + * machine-dependent tuning (e.g., assembly coding). 
+ */ + + +/* + * A forward DCT routine is given a pointer to a work area of type DCTELEM[]; + * the DCT is to be performed in-place in that buffer. Type DCTELEM is int + * for 8-bit samples, JLONG for 12-bit samples. (NOTE: Floating-point DCT + * implementations use an array of type FAST_FLOAT, instead.) + * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE). + * The DCT outputs are returned scaled up by a factor of 8; they therefore + * have a range of +-8K for 8-bit data, +-128K for 12-bit data. This + * convention improves accuracy in integer implementations and saves some + * work in floating-point ones. + * Quantization of the output coefficients is done by jcdctmgr.c. This + * step requires an unsigned type and also one with twice the bits. + */ + +#if BITS_IN_JSAMPLE == 8 +#ifndef WITH_SIMD +typedef int DCTELEM; /* 16 or 32 bits is fine */ +typedef unsigned int UDCTELEM; +typedef unsigned long long UDCTELEM2; +#else +typedef short DCTELEM; /* prefer 16 bit with SIMD for parellelism */ +typedef unsigned short UDCTELEM; +typedef unsigned int UDCTELEM2; +#endif +#else +typedef JLONG DCTELEM; /* must have 32 bits */ +typedef unsigned long long UDCTELEM2; +#endif + + +/* + * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer + * to an output sample array. The routine must dequantize the input data as + * well as perform the IDCT; for dequantization, it uses the multiplier table + * pointed to by compptr->dct_table. The output data is to be placed into the + * sample array starting at a specified column. (Any row offset needed will + * be applied to the array pointer before it is passed to the IDCT code.) + * Note that the number of samples emitted by the IDCT routine is + * DCT_scaled_size * DCT_scaled_size. + */ + +/* typedef inverse_DCT_method_ptr is declared in jpegint.h */ + +/* + * Each IDCT routine has its own ideas about the best dct_table element type. 
+ */ + +typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */ +#if BITS_IN_JSAMPLE == 8 +typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */ +#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */ +#else +typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */ +#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */ +#endif +typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ + + +/* + * Each IDCT routine is responsible for range-limiting its results and + * converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could + * be quite far out of range if the input data is corrupt, so a bulletproof + * range-limiting step is required. We use a mask-and-table-lookup method + * to do the combined operations quickly. See the comments with + * prepare_range_limit_table (in jdmaster.c) for more info. + */ + +#define IDCT_range_limit(cinfo) ((cinfo)->sample_range_limit + CENTERJSAMPLE) + +#define RANGE_MASK (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */ + + +/* Extern declarations for the forward and inverse DCT routines. 
*/ + +EXTERN(void) jpeg_fdct_islow (DCTELEM *data); +EXTERN(void) jpeg_fdct_ifast (DCTELEM *data); +EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data); + +EXTERN(void) jpeg_idct_islow + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_ifast + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_float + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_7x7 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_6x6 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_5x5 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_4x4 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_3x3 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_2x2 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_1x1 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_9x9 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_10x10 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, 
JDIMENSION output_col); +EXTERN(void) jpeg_idct_11x11 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_12x12 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_13x13 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_14x14 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_15x15 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_16x16 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); + + +/* + * Macros for handling fixed-point arithmetic; these are used by many + * but not all of the DCT/IDCT modules. + * + * All values are expected to be of type JLONG. + * Fractional constants are scaled left by CONST_BITS bits. + * CONST_BITS is defined within each module using these macros, + * and may differ from one module to the next. + */ + +#define ONE ((JLONG) 1) +#define CONST_SCALE (ONE << CONST_BITS) + +/* Convert a positive real constant to an integer scaled by CONST_SCALE. + * Caution: some C compilers fail to reduce "FIX(constant)" at compile time, + * thus causing a lot of useless floating-point operations at run time. + */ + +#define FIX(x) ((JLONG) ((x) * CONST_SCALE + 0.5)) + +/* Descale and correctly round a JLONG value that's scaled by N bits. + * We assume RIGHT_SHIFT rounds towards minus infinity, so adding + * the fudge factor is correct for either sign of X. 
+ */ + +#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) + +/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result. + * This macro is used only when the two inputs will actually be no more than + * 16 bits wide, so that a 16x16->32 bit multiply can be used instead of a + * full 32x32 multiply. This provides a useful speedup on many machines. + * Unfortunately there is no way to specify a 16x16->32 multiply portably + * in C, but some C compilers will do the right thing if you provide the + * correct combination of casts. + */ + +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((INT16) (const))) +#endif +#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ +#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((JLONG) (const))) +#endif + +#ifndef MULTIPLY16C16 /* default definition */ +#define MULTIPLY16C16(var,const) ((var) * (const)) +#endif + +/* Same except both inputs are variables. */ + +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#define MULTIPLY16V16(var1,var2) (((INT16) (var1)) * ((INT16) (var2))) +#endif + +#ifndef MULTIPLY16V16 /* default definition */ +#define MULTIPLY16V16(var1,var2) ((var1) * (var2)) +#endif diff --git a/Builder/jni-1.11/simd/h/jinclude.h b/Builder/jni-1.11/simd/h/jinclude.h new file mode 100644 index 000000000..d461a1aa1 --- /dev/null +++ b/Builder/jni-1.11/simd/h/jinclude.h @@ -0,0 +1,84 @@ +/* + * jinclude.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1994, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file exists to provide a single place to fix any problems with + * including the wrong system include files. 
(Common problems are taken + * care of by the standard jconfig symbols, but on really weird systems + * you may have to edit this file.) + * + * NOTE: this file is NOT intended to be included by applications using the + * JPEG library. Most applications need only include jpeglib.h. + */ + + +/* Include auto-config file to find out which system include files we need. */ + +#include "jconfig.h" /* auto configuration options */ +#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */ + +/* + * We need the NULL macro and size_t typedef. + * On an ANSI-conforming system it is sufficient to include <stddef.h>. + * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to + * pull in <sys/types.h> as well. + * Note that the core JPEG library does not require <stdio.h>; + * only the default error handler and data source/destination modules do. + * But we must pull it in because of the references to FILE in jpeglib.h. + * You can remove those references if you want to compile without <stdio.h>. + */ + +#ifdef HAVE_STDDEF_H +#include <stddef.h> +#endif + +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +#ifdef NEED_SYS_TYPES_H +#include <sys/types.h> +#endif + +#include <stdio.h> + +/* + * We need memory copying and zeroing functions, plus strncpy(). + * ANSI and System V implementations declare these in <string.h>. + * BSD doesn't have the mem() functions, but it does have bcopy()/bzero(). + * Some systems may declare memset and memcpy in <memory.h>. + * + * NOTE: we assume the size parameters to these functions are of type size_t. + * Change the casts in these macros if not!
+ */ + +#ifdef NEED_BSD_STRINGS + +#include <strings.h> +#define MEMZERO(target,size) bzero((void *)(target), (size_t)(size)) +#define MEMCOPY(dest,src,size) bcopy((const void *)(src), (void *)(dest), (size_t)(size)) + +#else /* not BSD, assume ANSI/SysV string lib */ + +#include <string.h> +#define MEMZERO(target,size) memset((void *)(target), 0, (size_t)(size)) +#define MEMCOPY(dest,src,size) memcpy((void *)(dest), (const void *)(src), (size_t)(size)) + +#endif + +/* + * The modules that use fread() and fwrite() always invoke them through + * these macros. On some systems you may need to twiddle the argument casts. + * CAUTION: argument order is different from underlying functions! + */ + +#define JFREAD(file,buf,sizeofbuf) \ + ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file))) +#define JFWRITE(file,buf,sizeofbuf) \ + ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file))) diff --git a/Builder/jni-1.11/simd/h/jpeglib.h b/Builder/jni-1.11/simd/h/jpeglib.h new file mode 100644 index 000000000..6c63f5822 --- /dev/null +++ b/Builder/jni-1.11/simd/h/jpeglib.h @@ -0,0 +1,1122 @@ +/* + * jpeglib.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * Modified 2002-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file defines the application interface for the JPEG library. + * Most applications using the library need only include this file, + * and perhaps jerror.h if they want to know the exact error codes. + */ + +#ifndef JPEGLIB_H +#define JPEGLIB_H + +/* + * First we include the configuration files that record how this + * installation of the JPEG library is set up. jconfig.h can be + * generated automatically for many systems.
jmorecfg.h contains + * manual configuration options that most people need not worry about. + */ + +#ifndef JCONFIG_INCLUDED /* in case jinclude.h already did */ +#include "jconfig.h" /* widely used configuration options */ +#endif +#include "jmorecfg.h" /* seldom changed options */ + + +#ifdef __cplusplus +#ifndef DONT_USE_EXTERN_C +extern "C" { +#endif +#endif + + +/* Various constants determining the sizes of things. + * All of these are specified by the JPEG standard, so don't change them + * if you want to be compatible. + */ + +#define DCTSIZE 8 /* The basic DCT block is 8x8 samples */ +#define DCTSIZE2 64 /* DCTSIZE squared; # of elements in a block */ +#define NUM_QUANT_TBLS 4 /* Quantization tables are numbered 0..3 */ +#define NUM_HUFF_TBLS 4 /* Huffman tables are numbered 0..3 */ +#define NUM_ARITH_TBLS 16 /* Arith-coding tables are numbered 0..15 */ +#define MAX_COMPS_IN_SCAN 4 /* JPEG limit on # of components in one scan */ +#define MAX_SAMP_FACTOR 4 /* JPEG limit on sampling factors */ +/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard; + * the PostScript DCT filter can emit files with many more than 10 blocks/MCU. + * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU + * to handle it. We even let you do this from the jconfig.h file. However, + * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe + * sometimes emits noncompliant files doesn't mean you should too. + */ +#define C_MAX_BLOCKS_IN_MCU 10 /* compressor's limit on blocks per MCU */ +#ifndef D_MAX_BLOCKS_IN_MCU +#define D_MAX_BLOCKS_IN_MCU 10 /* decompressor's limit on blocks per MCU */ +#endif + + +/* Data structures for images (arrays of samples and of DCT coefficients). + */ + +typedef JSAMPLE *JSAMPROW; /* ptr to one image row of pixel samples. 
*/ +typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */ +typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */ + +typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */ +typedef JBLOCK *JBLOCKROW; /* pointer to one row of coefficient blocks */ +typedef JBLOCKROW *JBLOCKARRAY; /* a 2-D array of coefficient blocks */ +typedef JBLOCKARRAY *JBLOCKIMAGE; /* a 3-D array of coefficient blocks */ + +typedef JCOEF *JCOEFPTR; /* useful in a couple of places */ + + +/* Types for JPEG compression parameters and working tables. */ + + +/* DCT coefficient quantization tables. */ + +typedef struct { + /* This array gives the coefficient quantizers in natural array order + * (not the zigzag order in which they are stored in a JPEG DQT marker). + * CAUTION: IJG versions prior to v6a kept this array in zigzag order. + */ + UINT16 quantval[DCTSIZE2]; /* quantization step for each coefficient */ + /* This field is used only during compression. It's initialized FALSE when + * the table is created, and set TRUE when it's been output to the file. + * You could suppress output of a table by setting this to TRUE. + * (See jpeg_suppress_tables for an example.) + */ + boolean sent_table; /* TRUE when table has been output */ +} JQUANT_TBL; + + +/* Huffman coding tables. */ + +typedef struct { + /* These two fields directly represent the contents of a JPEG DHT marker */ + UINT8 bits[17]; /* bits[k] = # of symbols with codes of */ + /* length k bits; bits[0] is unused */ + UINT8 huffval[256]; /* The symbols, in order of incr code length */ + /* This field is used only during compression. It's initialized FALSE when + * the table is created, and set TRUE when it's been output to the file. + * You could suppress output of a table by setting this to TRUE. + * (See jpeg_suppress_tables for an example.) + */ + boolean sent_table; /* TRUE when table has been output */ +} JHUFF_TBL; + + +/* Basic info about one component (color channel). 
*/ + +typedef struct { + /* These values are fixed over the whole image. */ + /* For compression, they must be supplied by parameter setup; */ + /* for decompression, they are read from the SOF marker. */ + int component_id; /* identifier for this component (0..255) */ + int component_index; /* its index in SOF or cinfo->comp_info[] */ + int h_samp_factor; /* horizontal sampling factor (1..4) */ + int v_samp_factor; /* vertical sampling factor (1..4) */ + int quant_tbl_no; /* quantization table selector (0..3) */ + /* These values may vary between scans. */ + /* For compression, they must be supplied by parameter setup; */ + /* for decompression, they are read from the SOS marker. */ + /* The decompressor output side may not use these variables. */ + int dc_tbl_no; /* DC entropy table selector (0..3) */ + int ac_tbl_no; /* AC entropy table selector (0..3) */ + + /* Remaining fields should be treated as private by applications. */ + + /* These values are computed during compression or decompression startup: */ + /* Component's size in DCT blocks. + * Any dummy blocks added to complete an MCU are not counted; therefore + * these values do not depend on whether a scan is interleaved or not. + */ + JDIMENSION width_in_blocks; + JDIMENSION height_in_blocks; + /* Size of a DCT block in samples. Always DCTSIZE for compression. + * For decompression this is the size of the output from one DCT block, + * reflecting any scaling we choose to apply during the IDCT step. + * Values from 1 to 16 are supported. + * Note that different components may receive different IDCT scalings. + */ +#if JPEG_LIB_VERSION >= 70 + int DCT_h_scaled_size; + int DCT_v_scaled_size; +#else + int DCT_scaled_size; +#endif + /* The downsampled dimensions are the component's actual, unpadded number + * of samples at the main buffer (preprocessing/compression interface), thus + * downsampled_width = ceil(image_width * Hi/Hmax) + * and similarly for height. 
For decompression, IDCT scaling is included, so + * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE) + */ + JDIMENSION downsampled_width; /* actual width in samples */ + JDIMENSION downsampled_height; /* actual height in samples */ + /* This flag is used only for decompression. In cases where some of the + * components will be ignored (eg grayscale output from YCbCr image), + * we can skip most computations for the unused components. + */ + boolean component_needed; /* do we need the value of this component? */ + + /* These values are computed before starting a scan of the component. */ + /* The decompressor output side may not use these variables. */ + int MCU_width; /* number of blocks per MCU, horizontally */ + int MCU_height; /* number of blocks per MCU, vertically */ + int MCU_blocks; /* MCU_width * MCU_height */ + int MCU_sample_width; /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */ + int last_col_width; /* # of non-dummy blocks across in last MCU */ + int last_row_height; /* # of non-dummy blocks down in last MCU */ + + /* Saved quantization table for component; NULL if none yet saved. + * See jdinput.c comments about the need for this information. + * This field is currently used only for decompression. + */ + JQUANT_TBL *quant_table; + + /* Private per-component storage for DCT or IDCT subsystem. */ + void *dct_table; +} jpeg_component_info; + + +/* The script for encoding a multiple-scan file is an array of these: */ + +typedef struct { + int comps_in_scan; /* number of components encoded in this scan */ + int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */ + int Ss, Se; /* progressive JPEG spectral selection parms */ + int Ah, Al; /* progressive JPEG successive approx. 
parms */ +} jpeg_scan_info; + +/* The decompressor can save APPn and COM markers in a list of these: */ + +typedef struct jpeg_marker_struct *jpeg_saved_marker_ptr; + +struct jpeg_marker_struct { + jpeg_saved_marker_ptr next; /* next in list, or NULL */ + UINT8 marker; /* marker code: JPEG_COM, or JPEG_APP0+n */ + unsigned int original_length; /* # bytes of data in the file */ + unsigned int data_length; /* # bytes of data saved at data[] */ + JOCTET *data; /* the data contained in the marker */ + /* the marker length word is not counted in data_length or original_length */ +}; + +/* Known color spaces. */ + +#define JCS_EXTENSIONS 1 +#define JCS_ALPHA_EXTENSIONS 1 + +typedef enum { + JCS_UNKNOWN, /* error/unspecified */ + JCS_GRAYSCALE, /* monochrome */ + JCS_RGB, /* red/green/blue as specified by the RGB_RED, + RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros */ + JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */ + JCS_CMYK, /* C/M/Y/K */ + JCS_YCCK, /* Y/Cb/Cr/K */ + JCS_EXT_RGB, /* red/green/blue */ + JCS_EXT_RGBX, /* red/green/blue/x */ + JCS_EXT_BGR, /* blue/green/red */ + JCS_EXT_BGRX, /* blue/green/red/x */ + JCS_EXT_XBGR, /* x/blue/green/red */ + JCS_EXT_XRGB, /* x/red/green/blue */ + /* When out_color_space is set to JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR, + or JCS_EXT_XRGB during decompression, the X byte is undefined, and in + order to ensure the best performance, libjpeg-turbo can set that byte to + whatever value it wishes. Use the following colorspace constants to + ensure that the X byte is set to 0xFF, so that it can be interpreted as an + opaque alpha channel. */ + JCS_EXT_RGBA, /* red/green/blue/alpha */ + JCS_EXT_BGRA, /* blue/green/red/alpha */ + JCS_EXT_ABGR, /* alpha/blue/green/red */ + JCS_EXT_ARGB, /* alpha/red/green/blue */ + JCS_RGB565 /* 5-bit red/6-bit green/5-bit blue */ +} J_COLOR_SPACE; + +/* DCT/IDCT algorithm options.
*/ + +typedef enum { + JDCT_ISLOW, /* slow but accurate integer algorithm */ + JDCT_IFAST, /* faster, less accurate integer method */ + JDCT_FLOAT /* floating-point: accurate, fast on fast HW */ +} J_DCT_METHOD; + +#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */ +#define JDCT_DEFAULT JDCT_ISLOW +#endif +#ifndef JDCT_FASTEST /* may be overridden in jconfig.h */ +#define JDCT_FASTEST JDCT_IFAST +#endif + +/* Dithering options for decompression. */ + +typedef enum { + JDITHER_NONE, /* no dithering */ + JDITHER_ORDERED, /* simple ordered dither */ + JDITHER_FS /* Floyd-Steinberg error diffusion dither */ +} J_DITHER_MODE; + + +/* Common fields between JPEG compression and decompression master structs. */ + +#define jpeg_common_fields \ + struct jpeg_error_mgr *err; /* Error handler module */\ + struct jpeg_memory_mgr *mem; /* Memory manager module */\ + struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\ + void *client_data; /* Available for use by application */\ + boolean is_decompressor; /* So common code can tell which is which */\ + int global_state /* For checking call sequence validity */ + +/* Routines that are to be used by both halves of the library are declared + * to receive a pointer to this structure. There are no actual instances of + * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct. + */ +struct jpeg_common_struct { + jpeg_common_fields; /* Fields common to both master struct types */ + /* Additional fields follow in an actual jpeg_compress_struct or + * jpeg_decompress_struct. All three structs must agree on these + * initial fields! (This would be a lot cleaner in C++.) 
+ */ +}; + +typedef struct jpeg_common_struct *j_common_ptr; +typedef struct jpeg_compress_struct *j_compress_ptr; +typedef struct jpeg_decompress_struct *j_decompress_ptr; + + +/* Master record for a compression instance */ + +struct jpeg_compress_struct { + jpeg_common_fields; /* Fields shared with jpeg_decompress_struct */ + + /* Destination for compressed data */ + struct jpeg_destination_mgr *dest; + + /* Description of source image --- these fields must be filled in by + * outer application before starting compression. in_color_space must + * be correct before you can even call jpeg_set_defaults(). + */ + + JDIMENSION image_width; /* input image width */ + JDIMENSION image_height; /* input image height */ + int input_components; /* # of color components in input image */ + J_COLOR_SPACE in_color_space; /* colorspace of input image */ + + double input_gamma; /* image gamma of input image */ + + /* Compression parameters --- these fields must be set before calling + * jpeg_start_compress(). We recommend calling jpeg_set_defaults() to + * initialize everything to reasonable defaults, then changing anything + * the application specifically wants to change. That way you won't get + * burnt when new parameters are added. Also note that there are several + * helper routines to simplify changing parameters. + */ + +#if JPEG_LIB_VERSION >= 70 + unsigned int scale_num, scale_denom; /* fraction by which to scale image */ + + JDIMENSION jpeg_width; /* scaled JPEG image width */ + JDIMENSION jpeg_height; /* scaled JPEG image height */ + /* Dimensions of actual JPEG image that will be written to file, + * derived from input dimensions by scaling factors above. + * These fields are computed by jpeg_start_compress(). + * You can also use jpeg_calc_jpeg_dimensions() to determine these values + * in advance of calling jpeg_start_compress(). 
+ */ +#endif + + int data_precision; /* bits of precision in image data */ + + int num_components; /* # of color components in JPEG image */ + J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */ + + jpeg_component_info *comp_info; + /* comp_info[i] describes component that appears i'th in SOF */ + + JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]; +#if JPEG_LIB_VERSION >= 70 + int q_scale_factor[NUM_QUANT_TBLS]; +#endif + /* ptrs to coefficient quantization tables, or NULL if not defined, + * and corresponding scale factors (percentage, initialized 100). + */ + + JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]; + JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]; + /* ptrs to Huffman coding tables, or NULL if not defined */ + + UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ + UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ + UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ + + int num_scans; /* # of entries in scan_info array */ + const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */ + /* The default value of scan_info is NULL, which causes a single-scan + * sequential JPEG file to be emitted. To create a multi-scan file, + * set num_scans and scan_info to point to an array of scan definitions. 
+ */ + + boolean raw_data_in; /* TRUE=caller supplies downsampled data */ + boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ + boolean optimize_coding; /* TRUE=optimize entropy encoding parms */ + boolean CCIR601_sampling; /* TRUE=first samples are cosited */ +#if JPEG_LIB_VERSION >= 70 + boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */ +#endif + int smoothing_factor; /* 1..100, or 0 for no input smoothing */ + J_DCT_METHOD dct_method; /* DCT algorithm selector */ + + /* The restart interval can be specified in absolute MCUs by setting + * restart_interval, or in MCU rows by setting restart_in_rows + * (in which case the correct restart_interval will be figured + * for each scan). + */ + unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */ + int restart_in_rows; /* if > 0, MCU rows per restart interval */ + + /* Parameters controlling emission of special markers. */ + + boolean write_JFIF_header; /* should a JFIF marker be written? */ + UINT8 JFIF_major_version; /* What to write for the JFIF version number */ + UINT8 JFIF_minor_version; + /* These three values are not used by the JPEG code, merely copied */ + /* into the JFIF APP0 marker. density_unit can be 0 for unknown, */ + /* 1 for dots/inch, or 2 for dots/cm. Note that the pixel aspect */ + /* ratio is defined by X_density/Y_density even when density_unit=0. */ + UINT8 density_unit; /* JFIF code for pixel size units */ + UINT16 X_density; /* Horizontal pixel density */ + UINT16 Y_density; /* Vertical pixel density */ + boolean write_Adobe_marker; /* should an Adobe marker be written? */ + + /* State variable: index of next scanline to be written to + * jpeg_write_scanlines(). Application may use this to control its + * processing loop, e.g., "while (next_scanline < image_height)". + */ + + JDIMENSION next_scanline; /* 0 .. 
image_height-1 */ + + /* Remaining fields are known throughout compressor, but generally + * should not be touched by a surrounding application. + */ + + /* + * These fields are computed during compression startup + */ + boolean progressive_mode; /* TRUE if scan script uses progressive mode */ + int max_h_samp_factor; /* largest h_samp_factor */ + int max_v_samp_factor; /* largest v_samp_factor */ + +#if JPEG_LIB_VERSION >= 70 + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ +#endif + + JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ + /* The coefficient controller receives data in units of MCU rows as defined + * for fully interleaved scans (whether the JPEG file is interleaved or not). + * There are v_samp_factor * DCTSIZE sample rows of each component in an + * "iMCU" (interleaved MCU) row. + */ + + /* + * These fields are valid during any one scan. + * They describe the components and MCUs actually appearing in the scan. 
+ */ + int comps_in_scan; /* # of JPEG components in this scan */ + jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN]; + /* *cur_comp_info[i] describes component that appears i'th in SOS */ + + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ + JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ + + int blocks_in_MCU; /* # of DCT blocks per MCU */ + int MCU_membership[C_MAX_BLOCKS_IN_MCU]; + /* MCU_membership[i] is index in cur_comp_info of component owning */ + /* i'th block in an MCU */ + + int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ + +#if JPEG_LIB_VERSION >= 80 + int block_size; /* the basic DCT block size: 1..16 */ + const int *natural_order; /* natural-order position array */ + int lim_Se; /* min( Se, DCTSIZE2-1 ) */ +#endif + + /* + * Links to compression subobjects (methods and private variables of modules) + */ + struct jpeg_comp_master *master; + struct jpeg_c_main_controller *main; + struct jpeg_c_prep_controller *prep; + struct jpeg_c_coef_controller *coef; + struct jpeg_marker_writer *marker; + struct jpeg_color_converter *cconvert; + struct jpeg_downsampler *downsample; + struct jpeg_forward_dct *fdct; + struct jpeg_entropy_encoder *entropy; + jpeg_scan_info *script_space; /* workspace for jpeg_simple_progression */ + int script_space_size; +}; + + +/* Master record for a decompression instance */ + +struct jpeg_decompress_struct { + jpeg_common_fields; /* Fields shared with jpeg_compress_struct */ + + /* Source of compressed data */ + struct jpeg_source_mgr *src; + + /* Basic description of image --- filled in by jpeg_read_header(). */ + /* Application may inspect these values to decide how to process image. 
*/ + + JDIMENSION image_width; /* nominal image width (from SOF marker) */ + JDIMENSION image_height; /* nominal image height */ + int num_components; /* # of color components in JPEG image */ + J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */ + + /* Decompression processing parameters --- these fields must be set before + * calling jpeg_start_decompress(). Note that jpeg_read_header() initializes + * them to default values. + */ + + J_COLOR_SPACE out_color_space; /* colorspace for output */ + + unsigned int scale_num, scale_denom; /* fraction by which to scale image */ + + double output_gamma; /* image gamma wanted in output */ + + boolean buffered_image; /* TRUE=multiple output passes */ + boolean raw_data_out; /* TRUE=downsampled data wanted */ + + J_DCT_METHOD dct_method; /* IDCT algorithm selector */ + boolean do_fancy_upsampling; /* TRUE=apply fancy upsampling */ + boolean do_block_smoothing; /* TRUE=apply interblock smoothing */ + + boolean quantize_colors; /* TRUE=colormapped output wanted */ + /* the following are ignored if not quantize_colors: */ + J_DITHER_MODE dither_mode; /* type of color dithering to use */ + boolean two_pass_quantize; /* TRUE=use two-pass color quantization */ + int desired_number_of_colors; /* max # colors to use in created colormap */ + /* these are significant only in buffered-image mode: */ + boolean enable_1pass_quant; /* enable future use of 1-pass quantizer */ + boolean enable_external_quant;/* enable future use of external colormap */ + boolean enable_2pass_quant; /* enable future use of 2-pass quantizer */ + + /* Description of actual output image that will be returned to application. + * These fields are computed by jpeg_start_decompress(). + * You can also use jpeg_calc_output_dimensions() to determine these values + * in advance of calling jpeg_start_decompress(). 
+ */ + + JDIMENSION output_width; /* scaled image width */ + JDIMENSION output_height; /* scaled image height */ + int out_color_components; /* # of color components in out_color_space */ + int output_components; /* # of color components returned */ + /* output_components is 1 (a colormap index) when quantizing colors; + * otherwise it equals out_color_components. + */ + int rec_outbuf_height; /* min recommended height of scanline buffer */ + /* If the buffer passed to jpeg_read_scanlines() is less than this many rows + * high, space and time will be wasted due to unnecessary data copying. + * Usually rec_outbuf_height will be 1 or 2, at most 4. + */ + + /* When quantizing colors, the output colormap is described by these fields. + * The application can supply a colormap by setting colormap non-NULL before + * calling jpeg_start_decompress; otherwise a colormap is created during + * jpeg_start_decompress or jpeg_start_output. + * The map has out_color_components rows and actual_number_of_colors columns. + */ + int actual_number_of_colors; /* number of entries in use */ + JSAMPARRAY colormap; /* The color map as a 2-D pixel array */ + + /* State variables: these variables indicate the progress of decompression. + * The application may examine these but must not modify them. + */ + + /* Row index of next scanline to be read from jpeg_read_scanlines(). + * Application may use this to control its processing loop, e.g., + * "while (output_scanline < output_height)". + */ + JDIMENSION output_scanline; /* 0 .. output_height-1 */ + + /* Current input scan number and number of iMCU rows completed in scan. + * These indicate the progress of the decompressor input side. + */ + int input_scan_number; /* Number of SOS markers seen so far */ + JDIMENSION input_iMCU_row; /* Number of iMCU rows completed */ + + /* The "output scan number" is the notional scan being displayed by the + * output side. 
The decompressor will not allow output scan/row number + * to get ahead of input scan/row, but it can fall arbitrarily far behind. + */ + int output_scan_number; /* Nominal scan number being displayed */ + JDIMENSION output_iMCU_row; /* Number of iMCU rows read */ + + /* Current progression status. coef_bits[c][i] indicates the precision + * with which component c's DCT coefficient i (in zigzag order) is known. + * It is -1 when no data has yet been received, otherwise it is the point + * transform (shift) value for the most recent scan of the coefficient + * (thus, 0 at completion of the progression). + * This pointer is NULL when reading a non-progressive file. + */ + int (*coef_bits)[DCTSIZE2]; /* -1 or current Al value for each coef */ + + /* Internal JPEG parameters --- the application usually need not look at + * these fields. Note that the decompressor output side may not use + * any parameters that can change between scans. + */ + + /* Quantization and Huffman tables are carried forward across input + * datastreams when processing abbreviated JPEG datastreams. + */ + + JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]; + /* ptrs to coefficient quantization tables, or NULL if not defined */ + + JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]; + JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]; + /* ptrs to Huffman coding tables, or NULL if not defined */ + + /* These parameters are never carried across datastreams, since they + * are given in SOF/SOS markers or defined to be reset by SOI. 
+ */ + + int data_precision; /* bits of precision in image data */ + + jpeg_component_info *comp_info; + /* comp_info[i] describes component that appears i'th in SOF */ + +#if JPEG_LIB_VERSION >= 80 + boolean is_baseline; /* TRUE if Baseline SOF0 encountered */ +#endif + boolean progressive_mode; /* TRUE if SOFn specifies progressive mode */ + boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ + + UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ + UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ + UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ + + unsigned int restart_interval; /* MCUs per restart interval, or 0 for no restart */ + + /* These fields record data obtained from optional markers recognized by + * the JPEG library. + */ + boolean saw_JFIF_marker; /* TRUE iff a JFIF APP0 marker was found */ + /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */ + UINT8 JFIF_major_version; /* JFIF version number */ + UINT8 JFIF_minor_version; + UINT8 density_unit; /* JFIF code for pixel size units */ + UINT16 X_density; /* Horizontal pixel density */ + UINT16 Y_density; /* Vertical pixel density */ + boolean saw_Adobe_marker; /* TRUE iff an Adobe APP14 marker was found */ + UINT8 Adobe_transform; /* Color transform code from Adobe marker */ + + boolean CCIR601_sampling; /* TRUE=first samples are cosited */ + + /* Aside from the specific data retained from APPn markers known to the + * library, the uninterpreted contents of any or all APPn and COM markers + * can be saved in a list for examination by the application. + */ + jpeg_saved_marker_ptr marker_list; /* Head of list of saved markers */ + + /* Remaining fields are known throughout decompressor, but generally + * should not be touched by a surrounding application. 
+ */ + + /* + * These fields are computed during decompression startup + */ + int max_h_samp_factor; /* largest h_samp_factor */ + int max_v_samp_factor; /* largest v_samp_factor */ + +#if JPEG_LIB_VERSION >= 70 + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ +#else + int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */ +#endif + + JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */ + /* The coefficient controller's input and output progress is measured in + * units of "iMCU" (interleaved MCU) rows. These are the same as MCU rows + * in fully interleaved JPEG scans, but are used whether the scan is + * interleaved or not. We define an iMCU row as v_samp_factor DCT block + * rows of each component. Therefore, the IDCT output contains + * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row. + */ + + JSAMPLE *sample_range_limit; /* table for fast range-limiting */ + + /* + * These fields are valid during any one scan. + * They describe the components and MCUs actually appearing in the scan. + * Note that the decompressor output side must not use these fields. + */ + int comps_in_scan; /* # of JPEG components in this scan */ + jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN]; + /* *cur_comp_info[i] describes component that appears i'th in SOS */ + + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ + JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ + + int blocks_in_MCU; /* # of DCT blocks per MCU */ + int MCU_membership[D_MAX_BLOCKS_IN_MCU]; + /* MCU_membership[i] is index in cur_comp_info of component owning */ + /* i'th block in an MCU */ + + int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ + +#if JPEG_LIB_VERSION >= 80 + /* These fields are derived from Se of first SOS marker. 
+ */ + int block_size; /* the basic DCT block size: 1..16 */ + const int *natural_order; /* natural-order position array for entropy decode */ + int lim_Se; /* min( Se, DCTSIZE2-1 ) for entropy decode */ +#endif + + /* This field is shared between entropy decoder and marker parser. + * It is either zero or the code of a JPEG marker that has been + * read from the data source, but has not yet been processed. + */ + int unread_marker; + + /* + * Links to decompression subobjects (methods, private variables of modules) + */ + struct jpeg_decomp_master *master; + struct jpeg_d_main_controller *main; + struct jpeg_d_coef_controller *coef; + struct jpeg_d_post_controller *post; + struct jpeg_input_controller *inputctl; + struct jpeg_marker_reader *marker; + struct jpeg_entropy_decoder *entropy; + struct jpeg_inverse_dct *idct; + struct jpeg_upsampler *upsample; + struct jpeg_color_deconverter *cconvert; + struct jpeg_color_quantizer *cquantize; +}; + + +/* "Object" declarations for JPEG modules that may be supplied or called + * directly by the surrounding application. + * As with all objects in the JPEG library, these structs only define the + * publicly visible methods and state variables of a module. Additional + * private fields may exist after the public ones. 
+ */ + + +/* Error handler object */ + +struct jpeg_error_mgr { + /* Error exit handler: does not return to caller */ + void (*error_exit) (j_common_ptr cinfo); + /* Conditionally emit a trace or warning message */ + void (*emit_message) (j_common_ptr cinfo, int msg_level); + /* Routine that actually outputs a trace or error message */ + void (*output_message) (j_common_ptr cinfo); + /* Format a message string for the most recent JPEG error or message */ + void (*format_message) (j_common_ptr cinfo, char *buffer); +#define JMSG_LENGTH_MAX 200 /* recommended size of format_message buffer */ + /* Reset error state variables at start of a new image */ + void (*reset_error_mgr) (j_common_ptr cinfo); + + /* The message ID code and any parameters are saved here. + * A message can have one string parameter or up to 8 int parameters. + */ + int msg_code; +#define JMSG_STR_PARM_MAX 80 + union { + int i[8]; + char s[JMSG_STR_PARM_MAX]; + } msg_parm; + + /* Standard state variables for error facility */ + + int trace_level; /* max msg_level that will be displayed */ + + /* For recoverable corrupt-data errors, we emit a warning message, + * but keep going unless emit_message chooses to abort. emit_message + * should count warnings in num_warnings. The surrounding application + * can check for bad data by seeing if num_warnings is nonzero at the + * end of processing. + */ + long num_warnings; /* number of corrupt-data warnings */ + + /* These fields point to the table(s) of error message strings. + * An application can change the table pointer to switch to a different + * message list (typically, to change the language in which errors are + * reported). Some applications may wish to add additional error codes + * that will be handled by the JPEG library error mechanism; the second + * table pointer is used for this purpose. + * + * First table includes all errors generated by JPEG library itself. + * Error code 0 is reserved for a "no such error string" message. 
+ */ + const char * const *jpeg_message_table; /* Library errors */ + int last_jpeg_message; /* Table contains strings 0..last_jpeg_message */ + /* Second table can be added by application (see cjpeg/djpeg for example). + * It contains strings numbered first_addon_message..last_addon_message. + */ + const char * const *addon_message_table; /* Non-library errors */ + int first_addon_message; /* code for first string in addon table */ + int last_addon_message; /* code for last string in addon table */ +}; + + +/* Progress monitor object */ + +struct jpeg_progress_mgr { + void (*progress_monitor) (j_common_ptr cinfo); + + long pass_counter; /* work units completed in this pass */ + long pass_limit; /* total number of work units in this pass */ + int completed_passes; /* passes completed so far */ + int total_passes; /* total number of passes expected */ +}; + + +/* Data destination object for compression */ + +struct jpeg_destination_mgr { + JOCTET *next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + + void (*init_destination) (j_compress_ptr cinfo); + boolean (*empty_output_buffer) (j_compress_ptr cinfo); + void (*term_destination) (j_compress_ptr cinfo); +}; + + +/* Data source object for decompression */ + +struct jpeg_source_mgr { + const JOCTET *next_input_byte; /* => next byte to read from buffer */ + size_t bytes_in_buffer; /* # of bytes remaining in buffer */ + + void (*init_source) (j_decompress_ptr cinfo); + boolean (*fill_input_buffer) (j_decompress_ptr cinfo); + void (*skip_input_data) (j_decompress_ptr cinfo, long num_bytes); + boolean (*resync_to_restart) (j_decompress_ptr cinfo, int desired); + void (*term_source) (j_decompress_ptr cinfo); +}; + + +/* Memory manager object. + * Allocates "small" objects (a few K total), "large" objects (tens of K), + * and "really big" objects (virtual arrays with backing store if needed). 
+ * The memory manager does not allow individual objects to be freed; rather, + * each created object is assigned to a pool, and whole pools can be freed + * at once. This is faster and more convenient than remembering exactly what + * to free, especially where malloc()/free() are not too speedy. + * NB: alloc routines never return NULL. They exit to error_exit if not + * successful. + */ + +#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */ +#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */ +#define JPOOL_NUMPOOLS 2 + +typedef struct jvirt_sarray_control *jvirt_sarray_ptr; +typedef struct jvirt_barray_control *jvirt_barray_ptr; + + +struct jpeg_memory_mgr { + /* Method pointers */ + void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject); + void *(*alloc_large) (j_common_ptr cinfo, int pool_id, + size_t sizeofobject); + JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id, + JDIMENSION samplesperrow, JDIMENSION numrows); + JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id, + JDIMENSION blocksperrow, JDIMENSION numrows); + jvirt_sarray_ptr (*request_virt_sarray) (j_common_ptr cinfo, int pool_id, + boolean pre_zero, + JDIMENSION samplesperrow, + JDIMENSION numrows, + JDIMENSION maxaccess); + jvirt_barray_ptr (*request_virt_barray) (j_common_ptr cinfo, int pool_id, + boolean pre_zero, + JDIMENSION blocksperrow, + JDIMENSION numrows, + JDIMENSION maxaccess); + void (*realize_virt_arrays) (j_common_ptr cinfo); + JSAMPARRAY (*access_virt_sarray) (j_common_ptr cinfo, jvirt_sarray_ptr ptr, + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable); + JBLOCKARRAY (*access_virt_barray) (j_common_ptr cinfo, jvirt_barray_ptr ptr, + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable); + void (*free_pool) (j_common_ptr cinfo, int pool_id); + void (*self_destruct) (j_common_ptr cinfo); + + /* Limit on memory allocation for this JPEG object. 
(Note that this is + * merely advisory, not a guaranteed maximum; it only affects the space + * used for virtual-array buffers.) May be changed by outer application + * after creating the JPEG object. + */ + long max_memory_to_use; + + /* Maximum allocation request accepted by alloc_large. */ + long max_alloc_chunk; +}; + + +/* Routine signature for application-supplied marker processing methods. + * Need not pass marker code since it is stored in cinfo->unread_marker. + */ +typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo); + + +/* Originally, this macro was used as a way of defining function prototypes + * for both modern compilers as well as older compilers that did not support + * prototype parameters. libjpeg-turbo has never supported these older, + * non-ANSI compilers, but the macro is still included because there is some + * software out there that uses it. + */ + +#define JPP(arglist) arglist + + +/* Default error-management setup */ +EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err); + +/* Initialization of JPEG compression objects. + * jpeg_create_compress() and jpeg_create_decompress() are the exported + * names that applications should call. These expand to calls on + * jpeg_CreateCompress and jpeg_CreateDecompress with additional information + * passed for version mismatch checking. + * NB: you must set up the error-manager BEFORE calling jpeg_create_xxx. 
+ */ +#define jpeg_create_compress(cinfo) \ + jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \ + (size_t) sizeof(struct jpeg_compress_struct)) +#define jpeg_create_decompress(cinfo) \ + jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \ + (size_t) sizeof(struct jpeg_decompress_struct)) +EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version, + size_t structsize); +EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, + size_t structsize); +/* Destruction of JPEG compression objects */ +EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo); +EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo); + +/* Standard data source and destination managers: stdio streams. */ +/* Caller is responsible for opening the file before and closing after. */ +EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile); +EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile); + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +/* Data source and destination managers: memory buffers. 
*/ +EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer, + unsigned long *outsize); +EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, + const unsigned char *inbuffer, + unsigned long insize); +#endif + +/* Default parameter setup for compression */ +EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo); +/* Compression parameter setup aids */ +EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo, + J_COLOR_SPACE colorspace); +EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo); +EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality, + boolean force_baseline); +EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, + boolean force_baseline); +#if JPEG_LIB_VERSION >= 70 +EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo, + boolean force_baseline); +#endif +EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, + const unsigned int *basic_table, + int scale_factor, boolean force_baseline); +EXTERN(int) jpeg_quality_scaling (int quality); +EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo); +EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress); +EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo); +EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo); + +/* Main entry points for compression */ +EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo, + boolean write_all_tables); +EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo, + JSAMPARRAY scanlines, + JDIMENSION num_lines); +EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo); + +#if JPEG_LIB_VERSION >= 70 +/* Precalculate JPEG dimensions for current compression parameters. */ +EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo); +#endif + +/* Replaces jpeg_write_scanlines when writing raw downsampled data. */ +EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data, + JDIMENSION num_lines); + +/* Write a special marker. 
See libjpeg.txt concerning safe usage. */ +EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker, + const JOCTET *dataptr, unsigned int datalen); +/* Same, but piecemeal. */ +EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker, + unsigned int datalen); +EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val); + +/* Alternate compression function: just write an abbreviated table file */ +EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo); + +/* Decompression startup: read start of JPEG datastream to see what's there */ +EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image); +/* Return value is one of: */ +#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */ +#define JPEG_HEADER_OK 1 /* Found valid image datastream */ +#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */ +/* If you pass require_image = TRUE (normal case), you need not check for + * a TABLES_ONLY return code; an abbreviated file will cause an error exit. + * JPEG_SUSPENDED is only possible if you use a data source module that can + * give a suspension return (the stdio source module doesn't). + */ + +/* Main entry points for decompression */ +EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo); +EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo, + JSAMPARRAY scanlines, + JDIMENSION max_lines); +EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo, + JDIMENSION num_lines); +EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset, + JDIMENSION *width); +EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo); + +/* Replaces jpeg_read_scanlines when reading raw downsampled data. */ +EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, + JDIMENSION max_lines); + +/* Additional entry points for buffered-image mode. 
*/ +EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo); +EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number); +EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo); +EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo); +EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo); +EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo); +/* Return value is one of: */ +/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */ +#define JPEG_REACHED_SOS 1 /* Reached start of new scan */ +#define JPEG_REACHED_EOI 2 /* Reached end of image */ +#define JPEG_ROW_COMPLETED 3 /* Completed one iMCU row */ +#define JPEG_SCAN_COMPLETED 4 /* Completed last iMCU row of a scan */ + +/* Precalculate output dimensions for current decompression parameters. */ +#if JPEG_LIB_VERSION >= 80 +EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo); +#endif +EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo); + +/* Control saving of COM and APPn markers into marker_list. */ +EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code, + unsigned int length_limit); + +/* Install a special processing method for COM or APPn markers. */ +EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo, + int marker_code, + jpeg_marker_parser_method routine); + +/* Read or write raw DCT coefficients --- useful for lossless transcoding. */ +EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo); +EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo, + jvirt_barray_ptr *coef_arrays); +EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, + j_compress_ptr dstinfo); + +/* If you choose to abort compression or decompression before completing + * jpeg_finish_(de)compress, then you need to clean up to release memory, + * temporary files, etc. 
You can just call jpeg_destroy_(de)compress + * if you're done with the JPEG object, but if you want to clean it up and + * reuse it, call this: + */ +EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo); +EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo); + +/* Generic versions of jpeg_abort and jpeg_destroy that work on either + * flavor of JPEG object. These may be more convenient in some places. + */ +EXTERN(void) jpeg_abort (j_common_ptr cinfo); +EXTERN(void) jpeg_destroy (j_common_ptr cinfo); + +/* Default restart-marker-resync procedure for use by data source modules */ +EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired); + + +/* These marker codes are exported since applications and data source modules + * are likely to want to use them. + */ + +#define JPEG_RST0 0xD0 /* RST0 marker code */ +#define JPEG_EOI 0xD9 /* EOI marker code */ +#define JPEG_APP0 0xE0 /* APP0 marker code */ +#define JPEG_COM 0xFE /* COM marker code */ + + +/* If we have a brain-damaged compiler that emits warnings (or worse, errors) + * for structure definitions that are never filled in, keep it quiet by + * supplying dummy definitions for the various substructures. 
+ */ + +#ifdef INCOMPLETE_TYPES_BROKEN +#ifndef JPEG_INTERNALS /* will be defined in jpegint.h */ +struct jvirt_sarray_control { long dummy; }; +struct jvirt_barray_control { long dummy; }; +struct jpeg_comp_master { long dummy; }; +struct jpeg_c_main_controller { long dummy; }; +struct jpeg_c_prep_controller { long dummy; }; +struct jpeg_c_coef_controller { long dummy; }; +struct jpeg_marker_writer { long dummy; }; +struct jpeg_color_converter { long dummy; }; +struct jpeg_downsampler { long dummy; }; +struct jpeg_forward_dct { long dummy; }; +struct jpeg_entropy_encoder { long dummy; }; +struct jpeg_decomp_master { long dummy; }; +struct jpeg_d_main_controller { long dummy; }; +struct jpeg_d_coef_controller { long dummy; }; +struct jpeg_d_post_controller { long dummy; }; +struct jpeg_input_controller { long dummy; }; +struct jpeg_marker_reader { long dummy; }; +struct jpeg_entropy_decoder { long dummy; }; +struct jpeg_inverse_dct { long dummy; }; +struct jpeg_upsampler { long dummy; }; +struct jpeg_color_deconverter { long dummy; }; +struct jpeg_color_quantizer { long dummy; }; +#endif /* JPEG_INTERNALS */ +#endif /* INCOMPLETE_TYPES_BROKEN */ + + +/* + * The JPEG library modules define JPEG_INTERNALS before including this file. + * The internal structure declarations are read only when that is true. + * Applications using the library should not include jpegint.h, but may wish + * to include jerror.h. + */ + +#ifdef JPEG_INTERNALS +#include "jpegint.h" /* fetch private declarations */ +#include "jerror.h" /* fetch error codes too */ +#endif + +#ifdef __cplusplus +#ifndef DONT_USE_EXTERN_C +} +#endif +#endif + +#endif /* JPEGLIB_H */ diff --git a/Builder/jni-1.11/simd/h/jsimd.h b/Builder/jni-1.11/simd/h/jsimd.h new file mode 100644 index 000000000..3aa0779b8 --- /dev/null +++ b/Builder/jni-1.11/simd/h/jsimd.h @@ -0,0 +1,93 @@ +/* + * jsimd.h + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2011, 2014, D. R. Commander. 
+ * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +#include "jchuff.h" /* Declarations shared with jcphuff.c */ + +EXTERN(int) jsimd_can_rgb_ycc (void); +EXTERN(int) jsimd_can_rgb_gray (void); +EXTERN(int) jsimd_can_ycc_rgb (void); +EXTERN(int) jsimd_can_ycc_rgb565 (void); +EXTERN(int) jsimd_c_can_null_convert (void); + +EXTERN(void) jsimd_rgb_ycc_convert + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_rgb_gray_convert + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_ycc_rgb_convert + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_c_null_convert + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(int) jsimd_can_h2v2_downsample (void); +EXTERN(int) jsimd_can_h2v1_downsample (void); + +EXTERN(void) jsimd_h2v2_downsample + (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(int) jsimd_can_h2v2_smooth_downsample (void); + +EXTERN(void) jsimd_h2v2_smooth_downsample + (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample + (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(int) jsimd_can_h2v2_upsample (void); +EXTERN(int) jsimd_can_h2v1_upsample (void); +EXTERN(int) jsimd_can_int_upsample (void); + +EXTERN(void) 
jsimd_h2v2_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v1_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_int_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(int) jsimd_can_h2v2_fancy_upsample (void); +EXTERN(int) jsimd_can_h2v1_fancy_upsample (void); + +EXTERN(void) jsimd_h2v2_fancy_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v1_fancy_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(int) jsimd_can_h2v2_merged_upsample (void); +EXTERN(int) jsimd_can_h2v1_merged_upsample (void); + +EXTERN(void) jsimd_h2v2_merged_upsample + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_merged_upsample + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(int) jsimd_can_huff_encode_one_block (void); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); diff --git a/Builder/jni-1.11/simd/h/jsimddct.h b/Builder/jni-1.11/simd/h/jsimddct.h new file mode 100644 index 000000000..b19ab48d4 --- /dev/null +++ b/Builder/jni-1.11/simd/h/jsimddct.h @@ -0,0 +1,74 @@ +/* + * jsimddct.h + * + * Copyright 2009 Pierre Ossman for Cendio AB + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. 
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +EXTERN(int) jsimd_can_convsamp (void); +EXTERN(int) jsimd_can_convsamp_float (void); + +EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace); +EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data, + JDIMENSION start_col, + FAST_FLOAT *workspace); + +EXTERN(int) jsimd_can_fdct_islow (void); +EXTERN(int) jsimd_can_fdct_ifast (void); +EXTERN(int) jsimd_can_fdct_float (void); + +EXTERN(void) jsimd_fdct_islow (DCTELEM *data); +EXTERN(void) jsimd_fdct_ifast (DCTELEM *data); +EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data); + +EXTERN(int) jsimd_can_quantize (void); +EXTERN(int) jsimd_can_quantize_float (void); + +EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace); +EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace); + +EXTERN(int) jsimd_can_idct_2x2 (void); +EXTERN(int) jsimd_can_idct_4x4 (void); +EXTERN(int) jsimd_can_idct_6x6 (void); +EXTERN(int) jsimd_can_idct_12x12 (void); + +EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(int) jsimd_can_idct_islow (void); +EXTERN(int) jsimd_can_idct_ifast (void); +EXTERN(int) jsimd_can_idct_float (void); + +EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, 
JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); diff --git a/Builder/jni-1.11/simd/i386/lib/jsimd_i386.a b/Builder/jni-1.11/simd/i386/lib/jsimd_i386.a deleted file mode 100644 index b5d092dfe..000000000 Binary files a/Builder/jni-1.11/simd/i386/lib/jsimd_i386.a and /dev/null differ diff --git a/Builder/jni-1.11/simd/i386/src/jcclrmmx.asm b/Builder/jni-1.11/simd/i386/src/jcclrmmx.asm deleted file mode 100644 index e09525310..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcclrmmx.asm +++ /dev/null @@ -1,477 +0,0 @@ -; -; jcclrmmx.asm - colorspace conversion (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Convert some rows of samples to the output colorspace. 
-; -; GLOBAL(void) -; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width, -; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, -; JDIMENSION output_row, int num_rows); -; - -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 8 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_rgb_ycc_convert_mmx) - -EXTN(jsimd_rgb_ycc_convert_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 -.rowloop: - pushpic eax - push edx - push ebx - push edi - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - mov ebx, JSAMPROW [ebx] ; outptr1 - mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - cmp ecx, byte SIZEOF_MMWORD - jae short 
.columnloop - alignx 16,7 - -%if RGB_PIXELSIZE == 3 ; --------------- - -.column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - xor eax,eax - mov al, BYTE [esi+ecx] -.column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - xor edx,edx - mov dx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx -.column_ld4: - movd mmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd mmG, DWORD [esi+ecx] - psllq mmA, DWORD_BIT - por mmA,mmG -.column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - movq mmG,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - mov ecx, SIZEOF_MMWORD - jmp short .rgb_ycc_cnv -.column_ld16: - test cl, 2*SIZEOF_MMWORD - mov ecx, SIZEOF_MMWORD - jz short .rgb_ycc_cnv - movq mmF,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 - -.columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] - -.rgb_ycc_cnv: - ; mmA=(00 10 20 01 11 21 02 12) - ; mmG=(22 03 13 23 04 14 24 05) - ; mmF=(15 25 06 16 26 07 17 27) - - movq mmD,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) - psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) - - punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) - psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) - - punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) - punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) - - movq mmE,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) - psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) - - punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) - - punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) - punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) - - pxor mmH,mmH - - movq mmC,mmA - 
punpcklbw mmA,mmH ; mmA=(00 02 04 06) - punpckhbw mmC,mmH ; mmC=(10 12 14 16) - - movq mmB,mmE - punpcklbw mmE,mmH ; mmE=(20 22 24 26) - punpckhbw mmB,mmH ; mmB=(01 03 05 07) - - movq mmF,mmD - punpcklbw mmD,mmH ; mmD=(11 13 15 17) - punpckhbw mmF,mmH ; mmF=(21 23 25 27) - -%else ; RGB_PIXELSIZE == 4 ; ----------- - -.column_ld1: - test cl, SIZEOF_MMWORD/8 - jz short .column_ld2 - sub ecx, byte SIZEOF_MMWORD/8 - movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld2: - test cl, SIZEOF_MMWORD/4 - jz short .column_ld4 - sub ecx, byte SIZEOF_MMWORD/4 - movq mmF,mmA - movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld4: - test cl, SIZEOF_MMWORD/2 - mov ecx, SIZEOF_MMWORD - jz short .rgb_ycc_cnv - movq mmD,mmA - movq mmC,mmF - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 - -.columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] - movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] - -.rgb_ycc_cnv: - ; mmA=(00 10 20 30 01 11 21 31) - ; mmF=(02 12 22 32 03 13 23 33) - ; mmD=(04 14 24 34 05 15 25 35) - ; mmC=(06 16 26 36 07 17 27 37) - - movq mmB,mmA - punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) - punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) - - movq mmG,mmD - punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) - punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) - - movq mmE,mmA - punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) - - movq mmH,mmB - punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) - punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) - - pxor mmF,mmF - - movq mmC,mmA - punpcklbw mmA,mmF ; mmA=(00 02 04 06) - punpckhbw mmC,mmF ; mmC=(10 12 14 16) - - movq mmD,mmB - punpcklbw mmB,mmF ; mmB=(01 03 05 07) - punpckhbw mmD,mmF ; mmD=(11 13 15 17) - - movq mmG,mmE - punpcklbw mmE,mmF ; mmE=(20 22 24 26) - punpckhbw mmG,mmF ; mmG=(30 32 34 36) - - punpcklbw mmF,mmH - 
punpckhbw mmH,mmH - psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) - psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE - ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movq MMWORD [wk(0)], mm0 ; wk(0)=RE - movq MMWORD [wk(1)], mm1 ; wk(1)=RO - movq MMWORD [wk(2)], mm4 ; wk(2)=BE - movq MMWORD [wk(3)], mm5 ; wk(3)=BO - - movq mm6,mm1 - punpcklwd mm1,mm3 - punpckhwd mm6,mm3 - movq mm7,mm1 - movq mm4,mm6 - pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor mm1,mm1 - pxor mm6,mm6 - punpcklwd mm1,mm5 ; mm1=BOL - punpckhwd mm6,mm5 ; mm6=BOH - psrld mm1,1 ; mm1=BOL*FIX(0.500) - psrld mm6,1 ; mm6=BOH*FIX(0.500) - - movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] - - paddd mm7,mm1 - paddd mm4,mm6 - paddd mm7,mm5 - paddd mm4,mm5 - psrld mm7,SCALEBITS ; mm7=CbOL - psrld mm4,SCALEBITS ; mm4=CbOH - packssdw mm7,mm4 ; mm7=CbO - - movq mm1, MMWORD [wk(2)] ; mm1=BE - - movq mm6,mm0 - punpcklwd mm0,mm2 - punpckhwd mm6,mm2 - movq mm5,mm0 - movq mm4,mm6 - pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd 
mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor mm0,mm0 - pxor mm6,mm6 - punpcklwd mm0,mm1 ; mm0=BEL - punpckhwd mm6,mm1 ; mm6=BEH - psrld mm0,1 ; mm0=BEL*FIX(0.500) - psrld mm6,1 ; mm6=BEH*FIX(0.500) - - movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] - - paddd mm5,mm0 - paddd mm4,mm6 - paddd mm5,mm1 - paddd mm4,mm1 - psrld mm5,SCALEBITS ; mm5=CbEL - psrld mm4,SCALEBITS ; mm4=CbEH - packssdw mm5,mm4 ; mm5=CbE - - psllw mm7,BYTE_BIT - por mm5,mm7 ; mm5=Cb - movq MMWORD [ebx], mm5 ; Save Cb - - movq mm0, MMWORD [wk(3)] ; mm0=BO - movq mm6, MMWORD [wk(2)] ; mm6=BE - movq mm1, MMWORD [wk(1)] ; mm1=RO - - movq mm4,mm0 - punpcklwd mm0,mm3 - punpckhwd mm4,mm3 - movq mm7,mm0 - movq mm5,mm4 - pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] - - paddd mm0, MMWORD [wk(4)] - paddd mm4, MMWORD [wk(5)] - paddd mm0,mm3 - paddd mm4,mm3 - psrld mm0,SCALEBITS ; mm0=YOL - psrld mm4,SCALEBITS ; mm4=YOH - packssdw mm0,mm4 ; mm0=YO - - pxor mm3,mm3 - pxor mm4,mm4 - punpcklwd mm3,mm1 ; mm3=ROL - punpckhwd mm4,mm1 ; mm4=ROH - psrld mm3,1 ; mm3=ROL*FIX(0.500) - psrld mm4,1 ; mm4=ROH*FIX(0.500) - - movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] - - paddd mm7,mm3 - paddd mm5,mm4 - paddd mm7,mm1 - paddd mm5,mm1 - psrld mm7,SCALEBITS ; mm7=CrOL - psrld mm5,SCALEBITS ; mm5=CrOH - packssdw mm7,mm5 ; mm7=CrO - - movq mm3, 
MMWORD [wk(0)] ; mm3=RE - - movq mm4,mm6 - punpcklwd mm6,mm2 - punpckhwd mm4,mm2 - movq mm1,mm6 - movq mm5,mm4 - pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] - - paddd mm6, MMWORD [wk(6)] - paddd mm4, MMWORD [wk(7)] - paddd mm6,mm2 - paddd mm4,mm2 - psrld mm6,SCALEBITS ; mm6=YEL - psrld mm4,SCALEBITS ; mm4=YEH - packssdw mm6,mm4 ; mm6=YE - - psllw mm0,BYTE_BIT - por mm6,mm0 ; mm6=Y - movq MMWORD [edi], mm6 ; Save Y - - pxor mm2,mm2 - pxor mm4,mm4 - punpcklwd mm2,mm3 ; mm2=REL - punpckhwd mm4,mm3 ; mm4=REH - psrld mm2,1 ; mm2=REL*FIX(0.500) - psrld mm4,1 ; mm4=REH*FIX(0.500) - - movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] - - paddd mm1,mm2 - paddd mm5,mm4 - paddd mm1,mm0 - paddd mm5,mm0 - psrld mm1,SCALEBITS ; mm1=CrEL - psrld mm5,SCALEBITS ; mm5=CrEH - packssdw mm1,mm5 ; mm1=CrE - - psllw mm7,BYTE_BIT - por mm1,mm7 ; mm1=Cr - movq MMWORD [edx], mm1 ; Save Cr - - sub ecx, byte SIZEOF_MMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr - add edi, byte SIZEOF_MMWORD ; outptr0 - add ebx, byte SIZEOF_MMWORD ; outptr1 - add edx, byte SIZEOF_MMWORD ; outptr2 - cmp ecx, byte SIZEOF_MMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - pop ebx - pop edx - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For 
some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcclrss2.asm b/Builder/jni-1.11/simd/i386/src/jcclrss2.asm deleted file mode 100644 index 517b70563..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcclrss2.asm +++ /dev/null @@ -1,503 +0,0 @@ -; -; jcclrss2.asm - colorspace conversion (SSE2) -; -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Convert some rows of samples to the output colorspace. 
-; -; GLOBAL(void) -; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, -; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, -; JDIMENSION output_row, int num_rows); -; - -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 8 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - - global EXTN(jsimd_rgb_ycc_convert_sse2) - -EXTN(jsimd_rgb_ycc_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 -.rowloop: - pushpic eax - push edx - push ebx - push edi - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - mov ebx, JSAMPROW [ebx] ; outptr1 - mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - cmp ecx, byte SIZEOF_XMMWORD - jae near 
.columnloop - alignx 16,7 - -%if RGB_PIXELSIZE == 3 ; --------------- - -.column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] -.column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx -.column_ld4: - movd xmmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [esi+ecx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF -.column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub ecx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [esi+ecx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB -.column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - mov ecx, SIZEOF_XMMWORD - jmp short .rgb_ycc_cnv -.column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov ecx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 - -.columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] - -.rgb_ycc_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 
0E 16 1E 26 2E 07 0F 17 1F 27 2F) - - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - - pxor xmmH,xmmH - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) - -%else ; RGB_PIXELSIZE == 4 ; ----------- - -.column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub ecx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub ecx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE -.column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub ecx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld8: - test cl, SIZEOF_XMMWORD/2 - 
mov ecx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 - -.columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] - -.rgb_ycc_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - - pxor xmmF,xmmF - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; 
xmmD=(11 13 15 17 19 1B 1D 1F) - - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE - movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - movdqa xmm7,xmm1 - movdqa xmm4,xmm6 - pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor xmm1,xmm1 - pxor xmm6,xmm6 - punpcklwd xmm1,xmm5 ; xmm1=BOL - punpckhwd xmm6,xmm5 ; xmm6=BOH - psrld xmm1,1 ; xmm1=BOL*FIX(0.500) - psrld xmm6,1 ; xmm6=BOH*FIX(0.500) - - movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm1 - paddd xmm4,xmm6 - paddd xmm7,xmm5 - paddd xmm4,xmm5 - psrld xmm7,SCALEBITS 
; xmm7=CbOL - psrld xmm4,SCALEBITS ; xmm4=CbOH - packssdw xmm7,xmm4 ; xmm7=CbO - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - movdqa xmm5,xmm0 - movdqa xmm4,xmm6 - pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor xmm0,xmm0 - pxor xmm6,xmm6 - punpcklwd xmm0,xmm1 ; xmm0=BEL - punpckhwd xmm6,xmm1 ; xmm6=BEH - psrld xmm0,1 ; xmm0=BEL*FIX(0.500) - psrld xmm6,1 ; xmm6=BEH*FIX(0.500) - - movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm5,xmm0 - paddd xmm4,xmm6 - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrld xmm5,SCALEBITS ; xmm5=CbEL - psrld xmm4,SCALEBITS ; xmm4=CbEH - packssdw xmm5,xmm4 ; xmm5=CbE - - psllw xmm7,BYTE_BIT - por xmm5,xmm7 ; xmm5=Cb - movdqa XMMWORD [ebx], xmm5 ; Save Cb - - movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - movdqa xmm7,xmm0 - movdqa xmm5,xmm4 - pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] - - paddd xmm0, XMMWORD [wk(4)] - paddd xmm4, XMMWORD [wk(5)] - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw 
xmm0,xmm4 ; xmm0=YO - - pxor xmm3,xmm3 - pxor xmm4,xmm4 - punpcklwd xmm3,xmm1 ; xmm3=ROL - punpckhwd xmm4,xmm1 ; xmm4=ROH - psrld xmm3,1 ; xmm3=ROL*FIX(0.500) - psrld xmm4,1 ; xmm4=ROH*FIX(0.500) - - movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm3 - paddd xmm5,xmm4 - paddd xmm7,xmm1 - paddd xmm5,xmm1 - psrld xmm7,SCALEBITS ; xmm7=CrOL - psrld xmm5,SCALEBITS ; xmm5=CrOH - packssdw xmm7,xmm5 ; xmm7=CrO - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(6)] - paddd xmm4, XMMWORD [wk(7)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [edi], xmm6 ; Save Y - - pxor xmm2,xmm2 - pxor xmm4,xmm4 - punpcklwd xmm2,xmm3 ; xmm2=REL - punpckhwd xmm4,xmm3 ; xmm4=REH - psrld xmm2,1 ; xmm2=REL*FIX(0.500) - psrld xmm4,1 ; xmm4=REH*FIX(0.500) - - movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] - - paddd xmm1,xmm2 - paddd xmm5,xmm4 - paddd xmm1,xmm0 - paddd xmm5,xmm0 - psrld xmm1,SCALEBITS ; xmm1=CrEL - psrld xmm5,SCALEBITS ; xmm5=CrEH - packssdw xmm1,xmm5 ; xmm1=CrE - - psllw xmm7,BYTE_BIT - por xmm1,xmm7 ; xmm1=Cr - movdqa XMMWORD [edx], xmm1 ; Save Cr - - sub ecx, byte SIZEOF_XMMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add edi, byte SIZEOF_XMMWORD ; outptr0 - add ebx, byte SIZEOF_XMMWORD ; outptr1 - add edx, byte SIZEOF_XMMWORD ; outptr2 - cmp ecx, byte 
SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - pop ebx - pop edx - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcgrymmx.asm b/Builder/jni-1.11/simd/i386/src/jcgrymmx.asm deleted file mode 100644 index bbeea09be..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcgrymmx.asm +++ /dev/null @@ -1,357 +0,0 @@ -; -; jcgrymmx.asm - grayscale colorspace conversion (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2011 D. R. Commander -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Convert some rows of samples to the output colorspace. 
-; -; GLOBAL(void) -; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width, -; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, -; JDIMENSION output_row, int num_rows); -; - -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_rgb_gray_convert_mmx) - -EXTN(jsimd_rgb_gray_convert_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 -.rowloop: - pushpic eax - push edi - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - cmp ecx, byte SIZEOF_MMWORD - jae short .columnloop - alignx 16,7 - -%if RGB_PIXELSIZE == 3 ; --------------- - -.column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - xor eax,eax - mov al, BYTE [esi+ecx] 
-.column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - xor edx,edx - mov dx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx -.column_ld4: - movd mmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd mmG, DWORD [esi+ecx] - psllq mmA, DWORD_BIT - por mmA,mmG -.column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - movq mmG,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - mov ecx, SIZEOF_MMWORD - jmp short .rgb_gray_cnv -.column_ld16: - test cl, 2*SIZEOF_MMWORD - mov ecx, SIZEOF_MMWORD - jz short .rgb_gray_cnv - movq mmF,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 - -.columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] - -.rgb_gray_cnv: - ; mmA=(00 10 20 01 11 21 02 12) - ; mmG=(22 03 13 23 04 14 24 05) - ; mmF=(15 25 06 16 26 07 17 27) - - movq mmD,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) - psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) - - punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) - psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) - - punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) - punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) - - movq mmE,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) - psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) - - punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) - - punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) - punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) - - pxor mmH,mmH - - movq mmC,mmA - punpcklbw mmA,mmH ; mmA=(00 02 04 06) - punpckhbw mmC,mmH ; mmC=(10 12 14 16) - - movq mmB,mmE - punpcklbw mmE,mmH ; mmE=(20 22 24 26) - punpckhbw mmB,mmH ; mmB=(01 03 05 07) - - movq mmF,mmD - punpcklbw mmD,mmH ; mmD=(11 13 15 17) - punpckhbw mmF,mmH ; mmF=(21 23 25 27) 
- -%else ; RGB_PIXELSIZE == 4 ; ----------- - -.column_ld1: - test cl, SIZEOF_MMWORD/8 - jz short .column_ld2 - sub ecx, byte SIZEOF_MMWORD/8 - movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld2: - test cl, SIZEOF_MMWORD/4 - jz short .column_ld4 - sub ecx, byte SIZEOF_MMWORD/4 - movq mmF,mmA - movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld4: - test cl, SIZEOF_MMWORD/2 - mov ecx, SIZEOF_MMWORD - jz short .rgb_gray_cnv - movq mmD,mmA - movq mmC,mmF - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 - -.columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] - movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] - -.rgb_gray_cnv: - ; mmA=(00 10 20 30 01 11 21 31) - ; mmF=(02 12 22 32 03 13 23 33) - ; mmD=(04 14 24 34 05 15 25 35) - ; mmC=(06 16 26 36 07 17 27 37) - - movq mmB,mmA - punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) - punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) - - movq mmG,mmD - punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) - punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) - - movq mmE,mmA - punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) - - movq mmH,mmB - punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) - punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) - - pxor mmF,mmF - - movq mmC,mmA - punpcklbw mmA,mmF ; mmA=(00 02 04 06) - punpckhbw mmC,mmF ; mmC=(10 12 14 16) - - movq mmD,mmB - punpcklbw mmB,mmF ; mmB=(01 03 05 07) - punpckhbw mmD,mmF ; mmD=(11 13 15 17) - - movq mmG,mmE - punpcklbw mmE,mmF ; mmE=(20 22 24 26) - punpckhbw mmG,mmF ; mmG=(30 32 34 36) - - punpcklbw mmF,mmH - punpckhbw mmH,mmH - psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) - psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE - ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, 
mm5=(B1 B3 B5 B7)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movq mm6,mm1 - punpcklwd mm1,mm3 - punpckhwd mm6,mm3 - pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movq mm6,mm0 - punpcklwd mm0,mm2 - punpckhwd mm6,mm2 - pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movq mm0, mm5 ; mm0=BO - movq mm6, mm4 ; mm6=BE - - movq mm4,mm0 - punpcklwd mm0,mm3 - punpckhwd mm4,mm3 - pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] - - paddd mm0, mm1 - paddd mm4, mm7 - paddd mm0,mm3 - paddd mm4,mm3 - psrld mm0,SCALEBITS ; mm0=YOL - psrld mm4,SCALEBITS ; mm4=YOH - packssdw mm0,mm4 ; mm0=YO - - movq mm4,mm6 - punpcklwd mm6,mm2 - punpckhwd mm4,mm2 - pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] - - paddd mm6, MMWORD [wk(0)] - paddd mm4, MMWORD [wk(1)] - paddd mm6,mm2 - paddd mm4,mm2 - psrld mm6,SCALEBITS ; mm6=YEL - psrld mm4,SCALEBITS ; mm4=YEH - packssdw mm6,mm4 ; mm6=YE - - psllw mm0,BYTE_BIT - por mm6,mm0 ; mm6=Y - movq MMWORD [edi], mm6 ; Save Y - - sub ecx, byte SIZEOF_MMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr - add edi, byte SIZEOF_MMWORD ; outptr0 - cmp ecx, byte SIZEOF_MMWORD - jae near .columnloop - test 
ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcgryss2.asm b/Builder/jni-1.11/simd/i386/src/jcgryss2.asm deleted file mode 100644 index c29428793..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcgryss2.asm +++ /dev/null @@ -1,383 +0,0 @@ -; -; jcgryss2.asm - grayscale colorspace conversion (SSE2) -; -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; Copyright (C) 2011, D. R. Commander. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Convert some rows of samples to the output colorspace. 
-; -; GLOBAL(void) -; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, -; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, -; JDIMENSION output_row, int num_rows); -; - -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - - global EXTN(jsimd_rgb_gray_convert_sse2) - -EXTN(jsimd_rgb_gray_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 -.rowloop: - pushpic eax - push edi - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - alignx 16,7 - -%if RGB_PIXELSIZE == 3 ; --------------- - -.column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] -.column_ld2: 
- test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx -.column_ld4: - movd xmmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [esi+ecx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF -.column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub ecx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [esi+ecx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB -.column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - mov ecx, SIZEOF_XMMWORD - jmp short .rgb_gray_cnv -.column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov ecx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 - -.columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] - -.rgb_gray_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 
2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - - pxor xmmH,xmmH - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) - -%else ; RGB_PIXELSIZE == 4 ; ----------- - -.column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub ecx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub ecx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE -.column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub ecx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] -.column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov ecx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 - -.columnloop: - movdqu xmmA, XMMWORD 
[esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] - -.rgb_gray_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - - pxor xmmF,xmmF - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 
2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa xmm0, xmm5 ; xmm0=BO - movdqa xmm6, xmm4 ; xmm6=BE - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] - - paddd xmm0, xmm1 - paddd xmm4, xmm7 - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(0)] - paddd xmm4, XMMWORD [wk(1)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld 
xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [edi], xmm6 ; Save Y - - sub ecx, byte SIZEOF_XMMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add edi, byte SIZEOF_XMMWORD ; outptr0 - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcqnt3dn.asm b/Builder/jni-1.11/simd/i386/src/jcqnt3dn.asm deleted file mode 100644 index 182c86952..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcqnt3dn.asm +++ /dev/null @@ -1,233 +0,0 @@ -; -; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Load data into workspace, applying unsigned->signed conversion -; -; GLOBAL(void) -; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col, -; FAST_FLOAT * workspace); -; - -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT * workspace - - align 16 - global EXTN(jsimd_convsamp_float_3dnow) - -EXTN(jsimd_convsamp_float_3dnow): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw mm7,mm7 - psllw mm7,7 - packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 -.convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb mm0,mm7 ; mm0=(01234567) - psubb mm1,mm7 ; mm1=(89ABCDEF) - - punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) - punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) - punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) - punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) - - punpcklwd mm4,mm2 ; mm4=(***0***1) - punpckhwd mm2,mm2 ; mm2=(***2***3) - punpcklwd mm5,mm0 ; mm5=(***4***5) - punpckhwd mm0,mm0 ; mm0=(***6***7) - - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) - psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) - pi2fd mm4,mm4 - pi2fd mm2,mm2 - psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) - psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) - pi2fd mm5,mm5 - pi2fd mm0,mm0 - - movq MMWORD 
[MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 - movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 - movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 - - punpcklwd mm6,mm3 ; mm6=(***8***9) - punpckhwd mm3,mm3 ; mm3=(***A***B) - punpcklwd mm4,mm1 ; mm4=(***C***D) - punpckhwd mm1,mm1 ; mm1=(***E***F) - - psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) - psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) - pi2fd mm6,mm6 - pi2fd mm3,mm3 - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) - psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) - pi2fd mm4,mm4 - pi2fd mm1,mm1 - - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 - movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .convloop - - femms ; empty MMX/3DNow! state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - - -; -------------------------------------------------------------------------- -; -; Quantize/descale the coefficients, and store into coef_block -; -; GLOBAL(void) -; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors, -; FAST_FLOAT * workspace); -; - -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT * divisors -%define workspace ebp+16 ; FAST_FLOAT * workspace - - align 16 - global EXTN(jsimd_quantize_float_3dnow) - -EXTN(jsimd_quantize_float_3dnow): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) - movd mm7,eax - punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F} - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 
-.quantloop: - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] - pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] - pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] - - pfadd mm0,mm7 ; mm0=(00 ** 01 **) - pfadd mm1,mm7 ; mm1=(02 ** 03 **) - pfadd mm2,mm7 ; mm0=(04 ** 05 **) - pfadd mm3,mm7 ; mm1=(06 ** 07 **) - - movq mm4,mm0 - punpcklwd mm0,mm1 ; mm0=(00 02 ** **) - punpckhwd mm4,mm1 ; mm4=(01 03 ** **) - movq mm5,mm2 - punpcklwd mm2,mm3 ; mm2=(04 06 ** **) - punpckhwd mm5,mm3 ; mm5=(05 07 ** **) - - punpcklwd mm0,mm4 ; mm0=(00 01 02 03) - punpcklwd mm2,mm5 ; mm2=(04 05 06 07) - - movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] - movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] - pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] - pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] - - pfadd mm6,mm7 ; mm0=(10 ** 11 **) - pfadd mm1,mm7 ; mm4=(12 ** 13 **) - pfadd mm3,mm7 ; mm0=(14 ** 15 **) - pfadd mm4,mm7 ; mm4=(16 ** 17 **) - - movq mm5,mm6 - punpcklwd mm6,mm1 ; mm6=(10 12 ** **) - punpckhwd mm5,mm1 ; mm5=(11 13 ** **) - movq mm1,mm3 - punpcklwd mm3,mm4 ; mm3=(14 16 ** **) - punpckhwd mm1,mm4 ; mm1=(15 17 ** **) - - punpcklwd mm6,mm5 ; mm6=(10 11 12 13) - punpcklwd mm3,mm1 ; mm3=(14 15 16 17) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 - - add esi, byte 
16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz near .quantloop - - femms ; empty MMX/3DNow! state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcqntmmx.asm b/Builder/jni-1.11/simd/i386/src/jcqntmmx.asm deleted file mode 100644 index 08b08b79e..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcqntmmx.asm +++ /dev/null @@ -1,274 +0,0 @@ -; -; jcqntmmx.asm - sample data conversion and quantization (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Load data into workspace, applying unsigned->signed conversion -; -; GLOBAL(void) -; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col, -; DCTELEM * workspace); -; - -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; DCTELEM * workspace - - align 16 - global EXTN(jsimd_convsamp_mmx) - -EXTN(jsimd_convsamp_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pxor mm6,mm6 ; mm6=(all 0's) - pcmpeqw mm7,mm7 - psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) - movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) - - mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) - movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) - - movq mm4,mm0 - punpcklbw mm0,mm6 ; mm0=(0123) - punpckhbw mm4,mm6 ; mm4=(4567) - movq mm5,mm1 - punpcklbw mm1,mm6 ; mm1=(89AB) - punpckhbw mm5,mm6 ; mm5=(CDEF) - - paddw mm0,mm7 - paddw mm4,mm7 - paddw mm1,mm7 - paddw mm5,mm7 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 - movq MMWORD 
[MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 - - movq mm0,mm2 - punpcklbw mm2,mm6 ; mm2=(GHIJ) - punpckhbw mm0,mm6 ; mm0=(KLMN) - movq mm4,mm3 - punpcklbw mm3,mm6 ; mm3=(OPQR) - punpckhbw mm4,mm6 ; mm4=(STUV) - - paddw mm2,mm7 - paddw mm0,mm7 - paddw mm3,mm7 - paddw mm4,mm7 - - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 - - add esi, byte 4*SIZEOF_JSAMPROW - add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz short .convloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Quantize/descale the coefficients, and store into coef_block -; -; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). 
-; -; GLOBAL(void) -; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors, -; DCTELEM * workspace); -; - -%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) -%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) -%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) -%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM) - -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; DCTELEM * divisors -%define workspace ebp+16 ; DCTELEM * workspace - - align 16 - global EXTN(jsimd_quantize_mmx) - -EXTN(jsimd_quantize_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov ah, 2 - alignx 16,7 -.quantloop1: - mov al, DCTSIZE2/8/2 - alignx 16,7 -.quantloop2: - movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] - - movq mm0,mm2 - movq mm1,mm3 - - psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise - psraw mm3,(WORD_BIT-1) - - pxor mm0,mm2 ; val = -val - pxor mm1,mm3 - psubw mm0,mm2 - psubw mm1,mm3 - - ; - ; MMX is an annoyingly crappy instruction set. It has two - ; misfeatures that are causing problems here: - ; - ; - All multiplications are signed. - ; - ; - The second operand for the shifts is not treated as packed. 
- ; - ; - ; We work around the first problem by implementing this algorithm: - ; - ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) - ; { - ; enum { SHORT_BIT = 16 }; - ; signed short sx = (signed short) x; - ; signed short sy = (signed short) y; - ; signed long sz; - ; - ; sz = (long) sx * (long) sy; /* signed multiply */ - ; - ; if (sx < 0) sz += (long) sy << SHORT_BIT; - ; if (sy < 0) sz += (long) sx << SHORT_BIT; - ; - ; return (unsigned long) sz; - ; } - ; - ; (note that a negative sx adds _sy_ and vice versa) - ; - ; For the second problem, we replace the shift by a multiplication. - ; Unfortunately that means we have to deal with the signed issue again. - ; - - paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor - paddw mm1, MMWORD [CORRECTION(0,1,edx)] - - movq mm4,mm0 ; store current value for later - movq mm5,mm1 - pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal - pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] - paddw mm0,mm4 ; reciprocal is always negative (MSB=1), - paddw mm1,mm5 ; so we always need to add the initial value - ; (input value is never negative as we - ; inverted it at the start of this routine) - - ; here it gets a bit tricky as both scale - ; and mm0/mm1 can be negative - movq mm6, MMWORD [SCALE(0,0,edx)] ; scale - movq mm7, MMWORD [SCALE(0,1,edx)] - movq mm4,mm0 - movq mm5,mm1 - pmulhw mm0,mm6 - pmulhw mm1,mm7 - - psraw mm6,(WORD_BIT-1) ; determine if scale is negative - psraw mm7,(WORD_BIT-1) - - pand mm6,mm4 ; and add input if it is - pand mm7,mm5 - paddw mm0,mm6 - paddw mm1,mm7 - - psraw mm4,(WORD_BIT-1) ; then check if negative input - psraw mm5,(WORD_BIT-1) - - pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is - pand mm5, MMWORD [SCALE(0,1,edx)] - paddw mm0,mm4 - paddw mm1,mm5 - - pxor mm0,mm2 ; val = -val - pxor mm1,mm3 - psubw mm0,mm2 - psubw mm1,mm3 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 - - add esi, byte 
8*SIZEOF_DCTELEM - add edx, byte 8*SIZEOF_DCTELEM - add edi, byte 8*SIZEOF_JCOEF - dec al - jnz near .quantloop2 - dec ah - jnz near .quantloop1 ; to avoid branch misprediction - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcqnts2f.asm b/Builder/jni-1.11/simd/i386/src/jcqnts2f.asm deleted file mode 100644 index d80ae5dc9..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcqnts2f.asm +++ /dev/null @@ -1,171 +0,0 @@ -; -; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Load data into workspace, applying unsigned->signed conversion -; -; GLOBAL(void) -; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, -; FAST_FLOAT * workspace); -; - -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT * workspace - - align 16 - global EXTN(jsimd_convsamp_float_sse2) - -EXTN(jsimd_convsamp_float_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw xmm7,xmm7 - psllw xmm7,7 - packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 -.convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb xmm0,xmm7 ; xmm0=(01234567) - psubb xmm1,xmm7 ; xmm1=(89ABCDEF) - - punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) - punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) - - punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) - punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) - punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) - punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) - - psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) - psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) - cvtdq2ps xmm2,xmm2 ; xmm2=(0123) - cvtdq2ps xmm0,xmm0 ; xmm0=(4567) - psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) - psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) - cvtdq2ps 
xmm3,xmm3 ; xmm3=(89AB) - cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz short .convloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - - -; -------------------------------------------------------------------------- -; -; Quantize/descale the coefficients, and store into coef_block -; -; GLOBAL(void) -; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors, -; FAST_FLOAT * workspace); -; - -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT * divisors -%define workspace ebp+16 ; FAST_FLOAT * workspace - - align 16 - global EXTN(jsimd_quantize_float_sse2) - -EXTN(jsimd_quantize_float_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 -.quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - cvtps2dq xmm0,xmm0 - cvtps2dq xmm1,xmm1 - cvtps2dq xmm2,xmm2 - cvtps2dq xmm3,xmm3 - - packssdw xmm0,xmm1 - packssdw xmm2,xmm3 - - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 - movdqa 
XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 - - add esi, byte 16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz short .quantloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcqnts2i.asm b/Builder/jni-1.11/simd/i386/src/jcqnts2i.asm deleted file mode 100644 index 0864d6ed4..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcqnts2i.asm +++ /dev/null @@ -1,200 +0,0 @@ -; -; jcqnts2i.asm - sample data conversion and quantization (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Load data into workspace, applying unsigned->signed conversion -; -; GLOBAL(void) -; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, -; DCTELEM * workspace); -; - -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; DCTELEM * workspace - - align 16 - global EXTN(jsimd_convsamp_sse2) - -EXTN(jsimd_convsamp_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pxor xmm6,xmm6 ; xmm6=(all 0's) - pcmpeqw xmm7,xmm7 - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) - movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) - - mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) - movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) - - punpcklbw xmm0,xmm6 ; xmm0=(01234567) - punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) - paddw xmm0,xmm7 - paddw xmm1,xmm7 - punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) - punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) - paddw xmm2,xmm7 - paddw xmm3,xmm7 - - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD 
[XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 - - add esi, byte 4*SIZEOF_JSAMPROW - add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz short .convloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Quantize/descale the coefficients, and store into coef_block -; -; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). -; -; GLOBAL(void) -; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors, -; DCTELEM * workspace); -; - -%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) -%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) -%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) - -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; DCTELEM * divisors -%define workspace ebp+16 ; DCTELEM * workspace - - align 16 - global EXTN(jsimd_quantize_sse2) - -EXTN(jsimd_quantize_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/32 - alignx 16,7 -.quantloop: - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] - movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] - movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - psraw xmm4,(WORD_BIT-1) - psraw xmm5,(WORD_BIT-1) - psraw xmm6,(WORD_BIT-1) - psraw xmm7,(WORD_BIT-1) - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; - psubw 
xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; - psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; - psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; - - paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor - paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] - paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] - paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] - pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal - pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] - pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] - pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] - pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale - pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] - pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] - pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] - - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 - psubw xmm1,xmm5 - psubw xmm2,xmm6 - psubw xmm3,xmm7 - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 - - add esi, byte 32*SIZEOF_DCTELEM - add edx, byte 32*SIZEOF_DCTELEM - add edi, byte 32*SIZEOF_JCOEF - dec eax - jnz near .quantloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcqntsse.asm b/Builder/jni-1.11/simd/i386/src/jcqntsse.asm deleted file mode 100644 index 3065eca81..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcqntsse.asm +++ /dev/null @@ -1,211 +0,0 @@ -; -; jcqntsse.asm - sample data conversion and quantization (SSE & MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. 
-; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Load data into workspace, applying unsigned->signed conversion -; -; GLOBAL(void) -; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col, -; FAST_FLOAT * workspace); -; - -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT * workspace - - align 16 - global EXTN(jsimd_convsamp_float_sse) - -EXTN(jsimd_convsamp_float_sse): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw mm7,mm7 - psllw mm7,7 - packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) 
- - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 -.convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb mm0,mm7 ; mm0=(01234567) - psubb mm1,mm7 ; mm1=(89ABCDEF) - - punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) - punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) - punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) - punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) - - punpcklwd mm4,mm2 ; mm4=(***0***1) - punpckhwd mm2,mm2 ; mm2=(***2***3) - punpcklwd mm5,mm0 ; mm5=(***4***5) - punpckhwd mm0,mm0 ; mm0=(***6***7) - - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) - psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) - cvtpi2ps xmm0,mm4 ; xmm0=(01**) - cvtpi2ps xmm1,mm2 ; xmm1=(23**) - psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) - psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) - cvtpi2ps xmm2,mm5 ; xmm2=(45**) - cvtpi2ps xmm3,mm0 ; xmm3=(67**) - - punpcklwd mm6,mm3 ; mm6=(***8***9) - punpckhwd mm3,mm3 ; mm3=(***A***B) - punpcklwd mm4,mm1 ; mm4=(***C***D) - punpckhwd mm1,mm1 ; mm1=(***E***F) - - psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) - psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) - cvtpi2ps xmm4,mm6 ; xmm4=(89**) - cvtpi2ps xmm5,mm3 ; xmm5=(AB**) - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) - psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) - cvtpi2ps xmm6,mm4 ; xmm6=(CD**) - cvtpi2ps xmm7,mm1 ; xmm7=(EF**) - - movlhps xmm0,xmm1 ; xmm0=(0123) - movlhps xmm2,xmm3 ; xmm2=(4567) - movlhps xmm4,xmm5 ; xmm4=(89AB) - movlhps xmm6,xmm7 ; xmm6=(CDEF) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - 
dec ecx - jnz near .convloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - - -; -------------------------------------------------------------------------- -; -; Quantize/descale the coefficients, and store into coef_block -; -; GLOBAL(void) -; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors, -; FAST_FLOAT * workspace); -; - -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT * divisors -%define workspace ebp+16 ; FAST_FLOAT * workspace - - align 16 - global EXTN(jsimd_quantize_float_sse) - -EXTN(jsimd_quantize_float_sse): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 -.quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - movhlps xmm4,xmm0 - movhlps xmm5,xmm1 - - cvtps2pi mm0,xmm0 - cvtps2pi mm1,xmm1 - cvtps2pi mm4,xmm4 - cvtps2pi mm5,xmm5 - - movhlps xmm6,xmm2 - movhlps xmm7,xmm3 - - cvtps2pi mm2,xmm2 - cvtps2pi mm3,xmm3 - cvtps2pi mm6,xmm6 - cvtps2pi mm7,xmm7 - - packssdw mm0,mm4 - packssdw mm1,mm5 - packssdw mm2,mm6 - packssdw mm3,mm7 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 - - add esi, byte 
16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz short .quantloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcsammmx.asm b/Builder/jni-1.11/simd/i386/src/jcsammmx.asm deleted file mode 100644 index 9e43b2f85..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcsammmx.asm +++ /dev/null @@ -1,324 +0,0 @@ -; -; jcsammmx.asm - downsampling (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Downsample pixel values of a single component. -; This version handles the common case of 2:1 horizontal and 1:1 vertical, -; without smoothing. 
-; -; GLOBAL(void) -; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, -; JDIMENSION v_samp_factor, JDIMENSION width_blocks, -; JSAMPARRAY input_data, JSAMPARRAY output_data); -; - -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data - - align 16 - global EXTN(jsimd_h2v1_downsample_mmx) - -EXTN(jsimd_h2v1_downsample_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 -.expandloop: - push eax - push ecx - - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] - - rep stosb - - pop ecx - pop eax - - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop - -.expand_end: - pop ecx ; output_cols - - ; -- h2v1_downsample - - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return - - mov edx, 0x00010000 ; bias pattern - movd mm7,edx - pcmpeqw mm6,mm6 - punpckldq mm7,mm7 ; mm7={0, 1, 0, 1} - psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 -.rowloop: - push ecx - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - alignx 16,7 -.columnloop: - - movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] - 
movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] - movq mm2,mm0 - movq mm3,mm1 - - pand mm0,mm6 - psrlw mm2,BYTE_BIT - pand mm1,mm6 - psrlw mm3,BYTE_BIT - - paddw mm0,mm2 - paddw mm1,mm3 - paddw mm0,mm7 - paddw mm1,mm7 - psrlw mm0,1 - psrlw mm1,1 - - packuswb mm0,mm1 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - - add esi, byte 2*SIZEOF_MMWORD ; inptr - add edi, byte 1*SIZEOF_MMWORD ; outptr - sub ecx, byte SIZEOF_MMWORD ; outcol - jnz short .columnloop - - pop esi - pop edi - pop ecx - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg short .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Downsample pixel values of a single component. -; This version handles the standard case of 2:1 horizontal and 2:1 vertical, -; without smoothing. 
-; -; GLOBAL(void) -; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, -; JDIMENSION v_samp_factor, JDIMENSION width_blocks, -; JSAMPARRAY input_data, JSAMPARRAY output_data); -; - -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data - - align 16 - global EXTN(jsimd_h2v2_downsample_mmx) - -EXTN(jsimd_h2v2_downsample_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 -.expandloop: - push eax - push ecx - - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] - - rep stosb - - pop ecx - pop eax - - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop - -.expand_end: - pop ecx ; output_cols - - ; -- h2v2_downsample - - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return - - mov edx, 0x00020001 ; bias pattern - movd mm7,edx - pcmpeqw mm6,mm6 - punpckldq mm7,mm7 ; mm7={1, 2, 1, 2} - psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 -.rowloop: - push ecx - push edi - push esi - - mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 - mov edi, JSAMPROW [edi] ; outptr - 
alignx 16,7 -.columnloop: - - movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] - movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] - movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] - - movq mm4,mm0 - movq mm5,mm1 - pand mm0,mm6 - psrlw mm4,BYTE_BIT - pand mm1,mm6 - psrlw mm5,BYTE_BIT - paddw mm0,mm4 - paddw mm1,mm5 - - movq mm4,mm2 - movq mm5,mm3 - pand mm2,mm6 - psrlw mm4,BYTE_BIT - pand mm3,mm6 - psrlw mm5,BYTE_BIT - paddw mm2,mm4 - paddw mm3,mm5 - - paddw mm0,mm1 - paddw mm2,mm3 - paddw mm0,mm7 - paddw mm2,mm7 - psrlw mm0,2 - psrlw mm2,2 - - packuswb mm0,mm2 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - - add edx, byte 2*SIZEOF_MMWORD ; inptr0 - add esi, byte 2*SIZEOF_MMWORD ; inptr1 - add edi, byte 1*SIZEOF_MMWORD ; outptr - sub ecx, byte SIZEOF_MMWORD ; outcol - jnz near .columnloop - - pop esi - pop edi - pop ecx - - add esi, byte 2*SIZEOF_JSAMPROW ; input_data - add edi, byte 1*SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcsamss2.asm b/Builder/jni-1.11/simd/i386/src/jcsamss2.asm deleted file mode 100644 index 818e911df..000000000 --- a/Builder/jni-1.11/simd/i386/src/jcsamss2.asm +++ /dev/null @@ -1,351 +0,0 @@ -; -; jcsamss2.asm - downsampling (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Downsample pixel values of a single component. -; This version handles the common case of 2:1 horizontal and 1:1 vertical, -; without smoothing. -; -; GLOBAL(void) -; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, -; JDIMENSION v_samp_factor, JDIMENSION width_blocks, -; JSAMPARRAY input_data, JSAMPARRAY output_data); -; - -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data - - align 16 - global EXTN(jsimd_h2v1_downsample_sse2) - -EXTN(jsimd_h2v1_downsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 -.expandloop: - push eax - push ecx - - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] - - rep stosb - - pop ecx - pop eax - - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop - -.expand_end: - pop ecx ; output_cols - - ; -- h2v1_downsample - - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return - - mov edx, 0x00010000 ; bias pattern - movd 
xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 -.rowloop: - push ecx - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - alignx 16,7 - -.columnloop_r8: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - pxor xmm1,xmm1 - mov ecx, SIZEOF_XMMWORD - jmp short .downsample - alignx 16,7 - -.columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] - -.downsample: - movdqa xmm2,xmm0 - movdqa xmm3,xmm1 - - pand xmm0,xmm6 - psrlw xmm2,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm3,BYTE_BIT - - paddw xmm0,xmm2 - paddw xmm1,xmm3 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - psrlw xmm0,1 - psrlw xmm1,1 - - packuswb xmm0,xmm1 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - - sub ecx, byte SIZEOF_XMMWORD ; outcol - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add edi, byte 1*SIZEOF_XMMWORD ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - test ecx,ecx - jnz short .columnloop_r8 - - pop esi - pop edi - pop ecx - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Downsample pixel values of a single component. -; This version handles the standard case of 2:1 horizontal and 2:1 vertical, -; without smoothing. 
-; -; GLOBAL(void) -; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, -; JDIMENSION v_samp_factor, JDIMENSION width_blocks, -; JSAMPARRAY input_data, JSAMPARRAY output_data); -; - -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data - - align 16 - global EXTN(jsimd_h2v2_downsample_sse2) - -EXTN(jsimd_h2v2_downsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 -.expandloop: - push eax - push ecx - - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] - - rep stosb - - pop ecx - pop eax - - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop - -.expand_end: - pop ecx ; output_cols - - ; -- h2v2_downsample - - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return - - mov edx, 0x00020001 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 -.rowloop: - push ecx - push edi - push esi - - mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 - mov edi, 
JSAMPROW [edi] ; outptr - - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - alignx 16,7 - -.columnloop_r8: - movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov ecx, SIZEOF_XMMWORD - jmp short .downsample - alignx 16,7 - -.columnloop: - movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] - -.downsample: - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - pand xmm0,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm0,xmm4 - paddw xmm1,xmm5 - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - pand xmm2,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm3,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm2,xmm4 - paddw xmm3,xmm5 - - paddw xmm0,xmm1 - paddw xmm2,xmm3 - paddw xmm0,xmm7 - paddw xmm2,xmm7 - psrlw xmm0,2 - psrlw xmm2,2 - - packuswb xmm0,xmm2 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - - sub ecx, byte SIZEOF_XMMWORD ; outcol - add edx, byte 2*SIZEOF_XMMWORD ; inptr0 - add esi, byte 2*SIZEOF_XMMWORD ; inptr1 - add edi, byte 1*SIZEOF_XMMWORD ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .columnloop_r8 - - pop esi - pop edi - pop ecx - - add esi, byte 2*SIZEOF_JSAMPROW ; input_data - add edi, byte 1*SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jdclrmmx.asm b/Builder/jni-1.11/simd/i386/src/jdclrmmx.asm deleted file mode 100644 index 1c255e802..000000000 --- a/Builder/jni-1.11/simd/i386/src/jdclrmmx.asm +++ /dev/null @@ -1,405 +0,0 @@ -; -; jdclrmmx.asm - colorspace conversion (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Convert some rows of samples to the output colorspace. 
-; -; GLOBAL(void) -; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width, -; JSAMPIMAGE input_buf, JDIMENSION input_row, -; JSAMPARRAY output_buf, int num_rows) -; - -%define out_width(b) (b)+8 ; JDIMENSION out_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define input_row(b) (b)+16 ; JDIMENSION input_row -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define num_rows(b) (b)+24 ; int num_rows - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_ycc_rgb_convert_mmx) - -EXTN(jsimd_ycc_rgb_convert_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [out_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [input_row(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov edi, JSAMPARRAY [output_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 -.rowloop: - push eax - push edi - push edx - push ebx - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr0 - mov ebx, JSAMPROW [ebx] ; inptr1 - mov edx, JSAMPROW [edx] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16,7 -.columnloop: - - movq mm5, MMWORD [ebx] ; 
mm5=Cb(01234567) - movq mm1, MMWORD [edx] ; mm1=Cr(01234567) - - pcmpeqw mm4,mm4 - pcmpeqw mm7,mm7 - psrlw mm4,BYTE_BIT - psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} - movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} - - pand mm4,mm5 ; mm4=Cb(0246)=CbE - psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO - pand mm0,mm1 ; mm0=Cr(0246)=CrE - psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO - - paddw mm4,mm7 - paddw mm5,mm7 - paddw mm0,mm7 - paddw mm1,mm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movq mm2,mm4 ; mm2=CbE - movq mm3,mm5 ; mm3=CbO - paddw mm4,mm4 ; mm4=2*CbE - paddw mm5,mm5 ; mm5=2*CbO - movq mm6,mm0 ; mm6=CrE - movq mm7,mm1 ; mm7=CrO - paddw mm0,mm0 ; mm0=2*CrE - paddw mm1,mm1 ; mm1=2*CrO - - pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) - pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) - pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) - pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) - - paddw mm4,[GOTOFF(eax,PW_ONE)] - paddw mm5,[GOTOFF(eax,PW_ONE)] - psraw mm4,1 ; mm4=(CbE * -FIX(0.22800)) - psraw mm5,1 ; mm5=(CbO * -FIX(0.22800)) - paddw mm0,[GOTOFF(eax,PW_ONE)] - paddw mm1,[GOTOFF(eax,PW_ONE)] - psraw mm0,1 ; mm0=(CrE * FIX(0.40200)) - psraw mm1,1 ; mm1=(CrO * FIX(0.40200)) - - paddw mm4,mm2 - paddw mm5,mm3 - paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E - paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O - paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E - paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O - - movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E - movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O - - movq mm4,mm2 - movq mm5,mm3 - punpcklwd mm2,mm6 - punpckhwd mm4,mm6 - pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd mm3,mm7 - punpckhwd mm5,mm7 - pmaddwd 
mm3,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd mm2,[GOTOFF(eax,PD_ONEHALF)] - paddd mm4,[GOTOFF(eax,PD_ONEHALF)] - psrad mm2,SCALEBITS - psrad mm4,SCALEBITS - paddd mm3,[GOTOFF(eax,PD_ONEHALF)] - paddd mm5,[GOTOFF(eax,PD_ONEHALF)] - psrad mm3,SCALEBITS - psrad mm5,SCALEBITS - - packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movq mm5, MMWORD [esi] ; mm5=Y(01234567) - - pcmpeqw mm4,mm4 - psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} - pand mm4,mm5 ; mm4=Y(0246)=YE - psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO - - paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) - paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) - packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) - packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) - - paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) - paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) - packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) - packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) - - paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) - paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) - packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) - packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) - - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) - punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) - - movq mmG,mmA - movq mmH,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) - punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) - - psrlq 
mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) - psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) - - movq mmC,mmD - movq mmB,mmD - punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) - punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) - - psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) - - movq mmF,mmE - punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) - punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) - - punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) - punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) - punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) - - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - - sub ecx, byte SIZEOF_MMWORD - jz short .nextrow - - add esi, byte SIZEOF_MMWORD ; inptr0 - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - jmp near .columnloop - alignx 16,7 - -.column_st16: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_MMWORD - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq mmA,mmC - sub ecx, byte 2*SIZEOF_MMWORD - add edi, byte 2*SIZEOF_MMWORD - jmp short .column_st4 -.column_st8: - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmE - sub ecx, byte SIZEOF_MMWORD - add edi, byte SIZEOF_MMWORD -.column_st4: - movd eax,mmA - cmp ecx, byte SIZEOF_DWORD - jb short .column_st2 - mov DWORD [edi+0*SIZEOF_DWORD], eax - psrlq mmA,DWORD_BIT - movd eax,mmA - sub ecx, byte SIZEOF_DWORD - add edi, byte SIZEOF_DWORD -.column_st2: - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi+0*SIZEOF_WORD], ax - shr eax,WORD_BIT - sub ecx, byte SIZEOF_WORD - add edi, byte SIZEOF_WORD -.column_st1: - cmp ecx, byte SIZEOF_BYTE - jb short .nextrow - mov BYTE [edi+0*SIZEOF_BYTE], al - -%else 
; RGB_PIXELSIZE == 4 ; ----------- - -%ifdef RGBX_FILLER_0XFF - pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) -%else - pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) -%endif - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) - - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) - punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) - punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) - - movq mmC,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) - punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) - movq mmG,mmB - punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) - punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) - - movq mmD,mmA - punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) - punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) - movq mmH,mmC - punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) - punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) - - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - movq MMWORD [edi+3*SIZEOF_MMWORD], mmH - - sub ecx, byte SIZEOF_MMWORD - jz short .nextrow - - add esi, byte SIZEOF_MMWORD ; inptr0 - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - jmp near .columnloop - alignx 16,7 - -.column_st16: - cmp ecx, byte SIZEOF_MMWORD/2 - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq mmA,mmC - movq mmD,mmH - sub ecx, byte SIZEOF_MMWORD/2 - add edi, byte 2*SIZEOF_MMWORD -.column_st8: - cmp ecx, byte SIZEOF_MMWORD/4 - 
jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmD - sub ecx, byte SIZEOF_MMWORD/4 - add edi, byte 1*SIZEOF_MMWORD -.column_st4: - cmp ecx, byte SIZEOF_MMWORD/8 - jb short .nextrow - movd DWORD [edi+0*SIZEOF_DWORD], mmA - -%endif ; RGB_PIXELSIZE ; --------------- - - alignx 16,7 - -.nextrow: - pop ecx - pop esi - pop ebx - pop edx - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - add edi, byte SIZEOF_JSAMPROW ; output_buf - dec eax ; num_rows - jg near .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jdclrss2.asm b/Builder/jni-1.11/simd/i386/src/jdclrss2.asm deleted file mode 100644 index 97754cb43..000000000 --- a/Builder/jni-1.11/simd/i386/src/jdclrss2.asm +++ /dev/null @@ -1,460 +0,0 @@ -; -; jdclrss2.asm - colorspace conversion (SSE2) -; -; Copyright 2009, 2012 Pierre Ossman for Cendio AB -; Copyright 2012 D. R. Commander -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Convert some rows of samples to the output colorspace. 
-; -; GLOBAL(void) -; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width, -; JSAMPIMAGE input_buf, JDIMENSION input_row, -; JSAMPARRAY output_buf, int num_rows) -; - -%define out_width(b) (b)+8 ; JDIMENSION out_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define input_row(b) (b)+16 ; JDIMENSION input_row -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define num_rows(b) (b)+24 ; int num_rows - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_ycc_rgb_convert_sse2) - -EXTN(jsimd_ycc_rgb_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [out_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [input_row(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov edi, JSAMPARRAY [output_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 -.rowloop: - push eax - push edi - push edx - push ebx - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr0 - mov ebx, JSAMPROW [ebx] ; inptr1 - mov edx, JSAMPROW [edx] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16,7 -.columnloop: - - movdqa xmm5, XMMWORD 
[ebx] ; xmm5=Cb(0123456789ABCDEF) - movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - pcmpeqw xmm7,xmm7 - psrlw xmm4,BYTE_BIT - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} - - pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE - psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO - pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE - psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO - - paddw xmm4,xmm7 - paddw xmm5,xmm7 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm2,xmm4 ; xmm2=CbE - movdqa xmm3,xmm5 ; xmm3=CbO - paddw xmm4,xmm4 ; xmm4=2*CbE - paddw xmm5,xmm5 ; xmm5=2*CbO - movdqa xmm6,xmm0 ; xmm6=CrE - movdqa xmm7,xmm1 ; xmm7=CrO - paddw xmm0,xmm0 ; xmm0=2*CrE - paddw xmm1,xmm1 ; xmm1=2*CrO - - pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) - pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) - pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) - pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) - - paddw xmm4,[GOTOFF(eax,PW_ONE)] - paddw xmm5,[GOTOFF(eax,PW_ONE)] - psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) - psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) - paddw xmm0,[GOTOFF(eax,PW_ONE)] - paddw xmm1,[GOTOFF(eax,PW_ONE)] - psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) - psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) - - paddw xmm4,xmm2 - paddw xmm5,xmm3 - paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E - paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O - paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E - paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - punpcklwd 
xmm2,xmm6 - punpckhwd xmm4,xmm6 - pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd xmm3,xmm7 - punpckhwd xmm5,xmm7 - pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm2,SCALEBITS - psrad xmm4,SCALEBITS - paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm3,SCALEBITS - psrad xmm5,SCALEBITS - - packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} - pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE - psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO - - paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) - paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) - paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) - paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 
1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; 
--(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF -.out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .nextrow - - add esi, byte SIZEOF_XMMWORD ; inptr0 - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 - -.column_st32: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub ecx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 -.column_st16: - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD -.column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_MMWORD - sub ecx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD -.column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [edi], xmmA - add edi, byte SIZEOF_DWORD - sub ecx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD -.column_st3: - ; Store the lower 2 bytes of eax to the output when it has enough - ; space. 
- movd eax, xmmA - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi], ax - add edi, byte SIZEOF_WORD - sub ecx, byte SIZEOF_WORD - shr eax, 16 -.column_st1: - ; Store the lower 1 byte of eax to the output when it has enough - ; space. - test ecx, ecx - jz short .nextrow - mov BYTE [edi], al - -%else ; RGB_PIXELSIZE == 4 ; ----------- - -%ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) -%else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) -%endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - 
jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH -.out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .nextrow - - add esi, byte SIZEOF_XMMWORD ; inptr0 - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 - -.column_st32: - cmp ecx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub ecx, byte SIZEOF_XMMWORD/2 -.column_st16: - cmp ecx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD/4 -.column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD/8*4 - sub ecx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 -.column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. 
- test ecx, ecx - jz short .nextrow - movd XMM_DWORD [edi], xmmA - -%endif ; RGB_PIXELSIZE ; --------------- - - alignx 16,7 - -.nextrow: - pop ecx - pop esi - pop ebx - pop edx - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - add edi, byte SIZEOF_JSAMPROW ; output_buf - dec eax ; num_rows - jg near .rowloop - - sfence ; flush the write buffer - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jdmrgmmx.asm b/Builder/jni-1.11/simd/i386/src/jdmrgmmx.asm deleted file mode 100644 index d0800a737..000000000 --- a/Builder/jni-1.11/simd/i386/src/jdmrgmmx.asm +++ /dev/null @@ -1,464 +0,0 @@ -; -; jdmrgmmx.asm - merged upsampling/color conversion (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
-; -; GLOBAL(void) -; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width, -; JSAMPIMAGE input_buf, -; JDIMENSION in_row_group_ctr, -; JSAMPARRAY output_buf); -; - -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 3 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_h2v1_merged_upsample_mmx) - -EXTN(jsimd_h2v1_merged_upsample_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [output_width(eax)] ; col - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [in_row_group_ctr(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(eax)] - mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 - mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 - mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - - pop ecx ; col - - alignx 16,7 -.columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) - movq mm7, MMWORD [edx] ; mm7=Cr(01234567) - - pxor mm1,mm1 ; mm1=(all 0's) - pcmpeqw mm3,mm3 - psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80} - - movq mm4,mm6 - punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH 
- punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL - movq mm0,mm7 - punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH - punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL - - paddw mm6,mm3 - paddw mm4,mm3 - paddw mm7,mm3 - paddw mm0,mm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movq mm5,mm6 ; mm5=CbH - movq mm2,mm4 ; mm2=CbL - paddw mm6,mm6 ; mm6=2*CbH - paddw mm4,mm4 ; mm4=2*CbL - movq mm1,mm7 ; mm1=CrH - movq mm3,mm0 ; mm3=CrL - paddw mm7,mm7 ; mm7=2*CrH - paddw mm0,mm0 ; mm0=2*CrL - - pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800)) - pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800)) - pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200)) - pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200)) - - paddw mm6,[GOTOFF(eax,PW_ONE)] - paddw mm4,[GOTOFF(eax,PW_ONE)] - psraw mm6,1 ; mm6=(CbH * -FIX(0.22800)) - psraw mm4,1 ; mm4=(CbL * -FIX(0.22800)) - paddw mm7,[GOTOFF(eax,PW_ONE)] - paddw mm0,[GOTOFF(eax,PW_ONE)] - psraw mm7,1 ; mm7=(CrH * FIX(0.40200)) - psraw mm0,1 ; mm0=(CrL * FIX(0.40200)) - - paddw mm6,mm5 - paddw mm4,mm2 - paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H - paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L - paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H - paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L - - movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H - movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H - - movq mm6,mm5 - movq mm7,mm2 - punpcklwd mm5,mm1 - punpckhwd mm6,mm1 - pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd mm2,mm3 - punpckhwd mm7,mm3 - pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd mm5,[GOTOFF(eax,PD_ONEHALF)] - paddd mm6,[GOTOFF(eax,PD_ONEHALF)] - psrad mm5,SCALEBITS - psrad mm6,SCALEBITS - paddd mm2,[GOTOFF(eax,PD_ONEHALF)] - paddd 
mm7,[GOTOFF(eax,PD_ONEHALF)] - psrad mm2,SCALEBITS - psrad mm7,SCALEBITS - - packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st - alignx 16,7 - -.Yloop_2nd: - movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H - movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H - movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H - alignx 16,7 - -.Yloop_1st: - movq mm7, MMWORD [esi] ; mm7=Y(01234567) - - pcmpeqw mm6,mm6 - psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} - pand mm6,mm7 ; mm6=Y(0246)=YE - psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO - - movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H) - movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H) - movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H) - - paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6) - paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7) - packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) - packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) - - paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6) - paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7) - packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) - packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) - - paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6) - paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7) - packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) - packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) - - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) - punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) - - movq mmG,mmA - movq mmH,mmA - punpcklwd mmA,mmE ; 
mmA=(00 10 20 01 02 12 22 03) - punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) - - psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) - psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) - - movq mmC,mmD - movq mmB,mmD - punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) - punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) - - psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) - - movq mmF,mmE - punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) - punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) - - punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) - punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) - punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) - - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - - sub ecx, byte SIZEOF_MMWORD - jz near .endcolumn - - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - add esi, byte SIZEOF_MMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd - - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 - -.column_st16: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_MMWORD - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq mmA,mmC - sub ecx, byte 2*SIZEOF_MMWORD - add edi, byte 2*SIZEOF_MMWORD - jmp short .column_st4 -.column_st8: - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmE - sub ecx, byte SIZEOF_MMWORD - add edi, byte SIZEOF_MMWORD -.column_st4: - movd eax,mmA - cmp ecx, byte SIZEOF_DWORD - jb short .column_st2 - mov DWORD [edi+0*SIZEOF_DWORD], eax - psrlq mmA,DWORD_BIT - movd eax,mmA - sub ecx, byte SIZEOF_DWORD - add edi, byte SIZEOF_DWORD -.column_st2: - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi+0*SIZEOF_WORD], ax - shr eax,WORD_BIT - sub ecx, byte SIZEOF_WORD 
- add edi, byte SIZEOF_WORD -.column_st1: - cmp ecx, byte SIZEOF_BYTE - jb short .endcolumn - mov BYTE [edi+0*SIZEOF_BYTE], al - -%else ; RGB_PIXELSIZE == 4 ; ----------- - -%ifdef RGBX_FILLER_0XFF - pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) -%else - pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) -%endif - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) - - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) - punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) - punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) - - movq mmC,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) - punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) - movq mmG,mmB - punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) - punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) - - movq mmD,mmA - punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) - punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) - movq mmH,mmC - punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) - punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) - - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - movq MMWORD [edi+3*SIZEOF_MMWORD], mmH - - sub ecx, byte SIZEOF_MMWORD - jz short .endcolumn - - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - add esi, byte SIZEOF_MMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd - - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 - -.column_st16: - cmp ecx, byte SIZEOF_MMWORD/2 - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - 
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq mmA,mmC - movq mmD,mmH - sub ecx, byte SIZEOF_MMWORD/2 - add edi, byte 2*SIZEOF_MMWORD -.column_st8: - cmp ecx, byte SIZEOF_MMWORD/4 - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmD - sub ecx, byte SIZEOF_MMWORD/4 - add edi, byte 1*SIZEOF_MMWORD -.column_st4: - cmp ecx, byte SIZEOF_MMWORD/8 - jb short .endcolumn - movd DWORD [edi+0*SIZEOF_DWORD], mmA - -%endif ; RGB_PIXELSIZE ; --------------- - -.endcolumn: - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. -; -; GLOBAL(void) -; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width, -; JSAMPIMAGE input_buf, -; JDIMENSION in_row_group_ctr, -; JSAMPARRAY output_buf); -; - -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - - align 16 - global EXTN(jsimd_h2v2_merged_upsample_mmx) - -EXTN(jsimd_h2v2_merged_upsample_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov eax, JDIMENSION [output_width(ebp)] - - mov edi, JSAMPIMAGE [input_buf(ebp)] - mov ecx, JDIMENSION [in_row_group_ctr(ebp)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(ebp)] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - - push edx ; inptr2 - push ebx ; inptr1 - push esi ; inptr00 - mov ebx,esp - - push edi ; output_buf (outptr0) - push ecx ; in_row_group_ctr - push 
ebx ; input_buf - push eax ; output_width - - call near EXTN(jsimd_h2v1_merged_upsample_mmx) - - add esi, byte SIZEOF_JSAMPROW ; inptr01 - add edi, byte SIZEOF_JSAMPROW ; outptr1 - mov POINTER [ebx+0*SIZEOF_POINTER], esi - mov POINTER [ebx-1*SIZEOF_POINTER], edi - - call near EXTN(jsimd_h2v1_merged_upsample_mmx) - - add esp, byte 7*SIZEOF_DWORD - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jdmrgss2.asm b/Builder/jni-1.11/simd/i386/src/jdmrgss2.asm deleted file mode 100644 index 6494340f2..000000000 --- a/Builder/jni-1.11/simd/i386/src/jdmrgss2.asm +++ /dev/null @@ -1,519 +0,0 @@ -; -; jdmrgss2.asm - merged upsampling/color conversion (SSE2) -; -; Copyright 2009, 2012 Pierre Ossman for Cendio AB -; Copyright 2012 D. R. Commander -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jcolsamp.inc" - -; -------------------------------------------------------------------------- -; -; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
-; -; GLOBAL(void) -; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width, -; JSAMPIMAGE input_buf, -; JDIMENSION in_row_group_ctr, -; JSAMPARRAY output_buf); -; - -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 3 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_h2v1_merged_upsample_sse2) - -EXTN(jsimd_h2v1_merged_upsample_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [output_width(eax)] ; col - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [in_row_group_ctr(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(eax)] - mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 - mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 - mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - - pop ecx ; col - - alignx 16,7 -.columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) - movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) - - pxor xmm1,xmm1 ; xmm1=(all 0's) - pcmpeqw xmm3,xmm3 - psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - movdqa 
xmm4,xmm6 - punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH - punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL - movdqa xmm0,xmm7 - punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH - punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL - - paddw xmm6,xmm3 - paddw xmm4,xmm3 - paddw xmm7,xmm3 - paddw xmm0,xmm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm5,xmm6 ; xmm5=CbH - movdqa xmm2,xmm4 ; xmm2=CbL - paddw xmm6,xmm6 ; xmm6=2*CbH - paddw xmm4,xmm4 ; xmm4=2*CbL - movdqa xmm1,xmm7 ; xmm1=CrH - movdqa xmm3,xmm0 ; xmm3=CrL - paddw xmm7,xmm7 ; xmm7=2*CrH - paddw xmm0,xmm0 ; xmm0=2*CrL - - pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) - pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) - pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) - pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) - - paddw xmm6,[GOTOFF(eax,PW_ONE)] - paddw xmm4,[GOTOFF(eax,PW_ONE)] - psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) - psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) - paddw xmm7,[GOTOFF(eax,PW_ONE)] - paddw xmm0,[GOTOFF(eax,PW_ONE)] - psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) - psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) - - paddw xmm6,xmm5 - paddw xmm4,xmm2 - paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H - paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L - paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H - paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L - - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H - - movdqa xmm6,xmm5 - movdqa xmm7,xmm2 - punpcklwd xmm5,xmm1 - punpckhwd xmm6,xmm1 - pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd 
xmm7,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm5,SCALEBITS - psrad xmm6,SCALEBITS - paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm2,SCALEBITS - psrad xmm7,SCALEBITS - - packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st - alignx 16,7 - -.Yloop_2nd: - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H - alignx 16,7 - -.Yloop_1st: - movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) - - pcmpeqw xmm6,xmm6 - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE - psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO - - movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) - movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) - movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) - - paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) - paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) - paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) - paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 
25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; 
xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF -.out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .endcolumn - - add esi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd - - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 - -.column_st32: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub ecx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 -.column_st16: - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD -.column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_MMWORD - sub ecx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD -.column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. 
- cmp ecx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [edi], xmmA - add edi, byte SIZEOF_DWORD - sub ecx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD -.column_st3: - ; Store the lower 2 bytes of eax to the output when it has enough - ; space. - movd eax, xmmA - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi], ax - add edi, byte SIZEOF_WORD - sub ecx, byte SIZEOF_WORD - shr eax, 16 -.column_st1: - ; Store the lower 1 byte of eax to the output when it has enough - ; space. - test ecx, ecx - jz short .endcolumn - mov BYTE [edi], al - -%else ; RGB_PIXELSIZE == 4 ; ----------- - -%ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) -%else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) -%endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - 
movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH -.out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .endcolumn - - add esi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd - - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 - -.column_st32: - cmp ecx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub ecx, byte SIZEOF_XMMWORD/2 -.column_st16: - cmp ecx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD/4 -.column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD/8*4 - sub ecx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 -.column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. 
- test ecx, ecx - jz short .endcolumn - movd XMM_DWORD [edi], xmmA - -%endif ; RGB_PIXELSIZE ; --------------- - -.endcolumn: - sfence ; flush the write buffer - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. -; -; GLOBAL(void) -; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width, -; JSAMPIMAGE input_buf, -; JDIMENSION in_row_group_ctr, -; JSAMPARRAY output_buf); -; - -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - - align 16 - global EXTN(jsimd_h2v2_merged_upsample_sse2) - -EXTN(jsimd_h2v2_merged_upsample_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov eax, POINTER [output_width(ebp)] - - mov edi, JSAMPIMAGE [input_buf(ebp)] - mov ecx, JDIMENSION [in_row_group_ctr(ebp)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(ebp)] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - - push edx ; inptr2 - push ebx ; inptr1 - push esi ; inptr00 - mov ebx,esp - - push edi ; output_buf (outptr0) - push ecx ; in_row_group_ctr - push ebx ; input_buf - push eax ; output_width - - call near EXTN(jsimd_h2v1_merged_upsample_sse2) - - add esi, byte SIZEOF_JSAMPROW ; inptr01 - add edi, byte SIZEOF_JSAMPROW ; outptr1 - mov POINTER [ebx+0*SIZEOF_POINTER], esi - mov POINTER [ebx-1*SIZEOF_POINTER], edi - - call near EXTN(jsimd_h2v1_merged_upsample_sse2) - - add esp, byte 7*SIZEOF_DWORD - - 
pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jdsammmx.asm b/Builder/jni-1.11/simd/i386/src/jdsammmx.asm deleted file mode 100644 index c09e5b96c..000000000 --- a/Builder/jni-1.11/simd/i386/src/jdsammmx.asm +++ /dev/null @@ -1,737 +0,0 @@ -; -; jdsammmx.asm - upsampling (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_fancy_upsample_mmx) - -EXTN(jconst_fancy_upsample_mmx): - -PW_ONE times 4 dw 1 -PW_TWO times 4 dw 2 -PW_THREE times 4 dw 3 -PW_SEVEN times 4 dw 7 -PW_EIGHT times 4 dw 8 - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. -; -; The upsampling algorithm is linear interpolation between pixel centers, -; also known as a "triangle filter". This is a good compromise between -; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 -; of the way between input pixel centers. 
-; -; GLOBAL(void) -; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor, -; JDIMENSION downsampled_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_mmx) - -EXTN(jsimd_h2v1_fancy_upsample_mmx): - push ebp - mov ebp,esp - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push eax ; colctr - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - - test eax, SIZEOF_MMWORD-1 - jz short .skip - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample -.skip: - pxor mm0,mm0 ; mm0=(all 0's) - pcmpeqb mm7,mm7 - psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT - pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] - - add eax, byte SIZEOF_MMWORD-1 - and eax, byte -SIZEOF_MMWORD - cmp eax, byte SIZEOF_MMWORD - ja short .columnloop - alignx 16,7 - -.columnloop_last: - pcmpeqb mm6,mm6 - psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT - pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] - jmp short .upsample - alignx 16,7 - -.columnloop: - movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] - psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT - -.upsample: - movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm2,mm1 - movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7) - psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) - psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 
-) - - por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6) - por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8) - - movq mm7,mm1 - psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) - - movq mm4,mm1 - punpcklbw mm1,mm0 ; mm1=( 0 1 2 3) - punpckhbw mm4,mm0 ; mm4=( 4 5 6 7) - movq mm5,mm2 - punpcklbw mm2,mm0 ; mm2=(-1 0 1 2) - punpckhbw mm5,mm0 ; mm5=( 3 4 5 6) - movq mm6,mm3 - punpcklbw mm3,mm0 ; mm3=( 1 2 3 4) - punpckhbw mm6,mm0 ; mm6=( 5 6 7 8) - - pmullw mm1,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] - paddw mm2,[GOTOFF(ebx,PW_ONE)] - paddw mm5,[GOTOFF(ebx,PW_ONE)] - paddw mm3,[GOTOFF(ebx,PW_TWO)] - paddw mm6,[GOTOFF(ebx,PW_TWO)] - - paddw mm2,mm1 - paddw mm5,mm4 - psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6) - psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14) - paddw mm3,mm1 - paddw mm6,mm4 - psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7) - psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15) - - psllw mm3,BYTE_BIT - psllw mm6,BYTE_BIT - por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) - por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 - - sub eax, byte SIZEOF_MMWORD - add esi, byte 1*SIZEOF_MMWORD ; inptr - add edi, byte 2*SIZEOF_MMWORD ; outptr - cmp eax, byte SIZEOF_MMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg near .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. -; Again a triangle filter; see comments for h2v1 case, above. 
-; -; GLOBAL(void) -; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor, -; JDIMENSION downsampled_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 4 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_mmx) - -EXTN(jsimd_h2v2_fancy_upsample_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov edx,eax ; edx = original ebp - mov eax, JDIMENSION [downsamp_width(edx)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(edx)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(edx)] ; input_data - mov edi, POINTER [output_data_ptr(edx)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push eax ; colctr - push ecx - push edi - push esi - - mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - - test eax, SIZEOF_MMWORD-1 - jz short .skip - push edx - mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] - mov 
JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop edx -.skip: - ; -- process the first column block - - movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] - movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] - movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - pxor mm3,mm3 ; mm3=(all 0's) - movq mm4,mm0 - punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3) - punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7) - movq mm5,mm1 - punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3) - punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7) - movq mm6,mm2 - punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3) - punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7) - - pmullw mm0,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] - - pcmpeqb mm7,mm7 - psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT - - paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) - paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) - paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) - paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) - - movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save - movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data - movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 - - pand mm1,mm7 ; mm1=( 0 - - -) - pand mm2,mm7 ; mm2=( 0 - - -) - - movq MMWORD [wk(0)], mm1 - movq MMWORD [wk(1)], mm2 - - poppic ebx - - add eax, byte SIZEOF_MMWORD-1 - and eax, byte -SIZEOF_MMWORD - cmp eax, byte SIZEOF_MMWORD - ja short .columnloop - alignx 16,7 - -.columnloop_last: - ; -- process the last column block - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - pcmpeqb mm1,mm1 - psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT - movq mm2,mm1 - - pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) - pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) - - movq MMWORD [wk(2)], mm1 - movq MMWORD [wk(3)], mm2 - - jmp short .upsample - alignx 16,7 - 
-.columnloop: - ; -- process the next column block - - movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] - movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] - movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - pxor mm3,mm3 ; mm3=(all 0's) - movq mm4,mm0 - punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3) - punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7) - movq mm5,mm1 - punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3) - punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7) - movq mm6,mm2 - punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3) - punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7) - - pmullw mm0,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] - - paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) - paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) - paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) - paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) - - movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save - movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data - movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 - - psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) - psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) - - movq MMWORD [wk(2)], mm1 - movq MMWORD [wk(3)], mm2 - -.upsample: - ; -- process the upper row - - movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) - movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) - - movq mm0,mm7 - movq mm4,mm3 - psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -) - psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) - movq mm5,mm7 - movq mm6,mm3 - psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) - psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6) - - por mm0,mm4 ; mm0=( 1 2 3 4) - por mm5,mm6 ; mm5=( 3 4 5 6) - - movq mm1,mm7 - movq mm2,mm3 - psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) - psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -) - movq mm4,mm3 - psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) - - por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) - por mm2, 
MMWORD [wk(2)] ; mm2=( 5 6 7 8) - - movq MMWORD [wk(0)], mm4 - - pmullw mm7,[GOTOFF(ebx,PW_THREE)] - pmullw mm3,[GOTOFF(ebx,PW_THREE)] - paddw mm1,[GOTOFF(ebx,PW_EIGHT)] - paddw mm5,[GOTOFF(ebx,PW_EIGHT)] - paddw mm0,[GOTOFF(ebx,PW_SEVEN)] - paddw mm2,[GOTOFF(ebx,PW_SEVEN)] - - paddw mm1,mm7 - paddw mm5,mm3 - psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6) - psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14) - paddw mm0,mm7 - paddw mm2,mm3 - psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7) - psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15) - - psllw mm0,BYTE_BIT - psllw mm2,BYTE_BIT - por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) - por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) - - movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 - movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 - - ; -- process the lower row - - movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) - movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) - - movq mm7,mm6 - movq mm3,mm4 - psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -) - psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) - movq mm0,mm6 - movq mm2,mm4 - psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) - psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6) - - por mm7,mm3 ; mm7=( 1 2 3 4) - por mm0,mm2 ; mm0=( 3 4 5 6) - - movq mm1,mm6 - movq mm5,mm4 - psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) - psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -) - movq mm3,mm4 - psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) - - por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) - por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) - - movq MMWORD [wk(1)], mm3 - - pmullw mm6,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] - paddw mm1,[GOTOFF(ebx,PW_EIGHT)] - paddw mm0,[GOTOFF(ebx,PW_EIGHT)] - paddw mm7,[GOTOFF(ebx,PW_SEVEN)] - paddw mm5,[GOTOFF(ebx,PW_SEVEN)] - - paddw mm1,mm6 - paddw mm0,mm4 - psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6) - psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14) - paddw mm7,mm6 - paddw mm5,mm4 - psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7) - psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15) - - psllw mm7,BYTE_BIT - psllw mm5,BYTE_BIT - por mm1,mm7 ; 
mm1=Out1L=( 0 1 2 3 4 5 6 7) - por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 - - poppic ebx - - sub eax, byte SIZEOF_MMWORD - add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) - add ebx, byte 1*SIZEOF_MMWORD ; inptr0 - add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) - add edx, byte 2*SIZEOF_MMWORD ; outptr0 - add edi, byte 2*SIZEOF_MMWORD ; outptr1 - cmp eax, byte SIZEOF_MMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop ecx - pop eax - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg near .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. -; It's still a box filter. 
-; -; GLOBAL(void) -; jsimd_h2v1_upsample_mmx (int max_v_samp_factor, -; JDIMENSION output_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - - align 16 - global EXTN(jsimd_h2v1_upsample_mmx) - -EXTN(jsimd_h2v1_upsample_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_MMWORD)-1 - and edx, byte -(2*SIZEOF_MMWORD) - jz short .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - mov eax,edx ; colctr - alignx 16,7 -.columnloop: - - movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] - - movq mm1,mm0 - punpcklbw mm0,mm0 - punpckhbw mm1,mm1 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 - - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow - - movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] - - movq mm3,mm2 - punpcklbw mm2,mm2 - punpckhbw mm3,mm3 - - movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 - - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow - - add esi, byte 2*SIZEOF_MMWORD ; inptr - add edi, byte 4*SIZEOF_MMWORD ; outptr - jmp short .columnloop - alignx 16,7 - -.nextrow: - pop esi - pop edi - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg short .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; 
pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. -; It's still a box filter. -; -; GLOBAL(void) -; jsimd_h2v2_upsample_mmx (int max_v_samp_factor, -; JDIMENSION output_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - - align 16 - global EXTN(jsimd_h2v2_upsample_mmx) - -EXTN(jsimd_h2v2_upsample_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_MMWORD)-1 - and edx, byte -(2*SIZEOF_MMWORD) - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - mov eax,edx ; colctr - alignx 16,7 -.columnloop: - - movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] - - movq mm1,mm0 - punpcklbw mm0,mm0 - punpckhbw mm1,mm1 - - movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 - movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 - - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow - - movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] - - movq mm3,mm2 - punpcklbw mm2,mm2 - punpckhbw mm3,mm3 - - movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 - movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 - movq MMWORD [edi+2*SIZEOF_MMWORD], 
mm2 - movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 - - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow - - add esi, byte 2*SIZEOF_MMWORD ; inptr - add ebx, byte 4*SIZEOF_MMWORD ; outptr0 - add edi, byte 4*SIZEOF_MMWORD ; outptr1 - jmp short .columnloop - alignx 16,7 - -.nextrow: - pop esi - pop edi - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg short .rowloop - - emms ; empty MMX state - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jdsamss2.asm b/Builder/jni-1.11/simd/i386/src/jdsamss2.asm deleted file mode 100644 index b5c863b46..000000000 --- a/Builder/jni-1.11/simd/i386/src/jdsamss2.asm +++ /dev/null @@ -1,729 +0,0 @@ -; -; jdsamss2.asm - upsampling (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_fancy_upsample_sse2) - -EXTN(jconst_fancy_upsample_sse2): - -PW_ONE times 8 dw 1 -PW_TWO times 8 dw 2 -PW_THREE times 8 dw 3 -PW_SEVEN times 8 dw 7 -PW_EIGHT times 8 dw 8 - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. -; -; The upsampling algorithm is linear interpolation between pixel centers, -; also known as a "triangle filter". This is a good compromise between -; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 -; of the way between input pixel centers. -; -; GLOBAL(void) -; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, -; JDIMENSION downsampled_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) - -EXTN(jsimd_h2v1_fancy_upsample_sse2): - push ebp - mov ebp,esp - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push eax ; colctr - push edi - push esi - - mov 
esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - - test eax, SIZEOF_XMMWORD-1 - jz short .skip - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample -.skip: - pxor xmm0,xmm0 ; xmm0=(all 0's) - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-1) - pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] - - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - cmp eax, byte SIZEOF_XMMWORD - ja short .columnloop - alignx 16,7 - -.columnloop_last: - pcmpeqb xmm6,xmm6 - pslldq xmm6,(SIZEOF_XMMWORD-1) - pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] - jmp short .upsample - alignx 16,7 - -.columnloop: - movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] - pslldq xmm6,(SIZEOF_XMMWORD-1) - -.upsample: - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) - pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) - psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) - - por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) - por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) - - movdqa xmm7,xmm1 - psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) - - movdqa xmm4,xmm1 - punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm2 - punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) - punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) - movdqa xmm6,xmm3 - punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) - punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) - - pmullw xmm1,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm2,[GOTOFF(ebx,PW_ONE)] - paddw xmm5,[GOTOFF(ebx,PW_ONE)] - paddw xmm3,[GOTOFF(ebx,PW_TWO)] - paddw xmm6,[GOTOFF(ebx,PW_TWO)] - - paddw xmm2,xmm1 - paddw xmm5,xmm4 - psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) - paddw xmm3,xmm1 - paddw xmm6,xmm4 - psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) - psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) - - psllw xmm3,BYTE_BIT - psllw xmm6,BYTE_BIT - por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) - por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 - - sub eax, byte SIZEOF_XMMWORD - add esi, byte 1*SIZEOF_XMMWORD ; inptr - add edi, byte 2*SIZEOF_XMMWORD ; outptr - cmp eax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. -; Again a triangle filter; see comments for h2v1 case, above. 
-; -; GLOBAL(void) -; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, -; JDIMENSION downsampled_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 4 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) - -EXTN(jsimd_h2v2_fancy_upsample_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov edx,eax ; edx = original ebp - mov eax, JDIMENSION [downsamp_width(edx)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(edx)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(edx)] ; input_data - mov edi, POINTER [output_data_ptr(edx)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push eax ; colctr - push ecx - push edi - push esi - - mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - - test eax, SIZEOF_XMMWORD-1 - jz short .skip - push edx - mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] - 
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop edx -.skip: - ; -- process the first column block - - movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] - movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] - movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - - pmullw xmm0,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-2) - - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - - movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 - - pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) - pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) - - movdqa XMMWORD [wk(0)], xmm1 - movdqa XMMWORD [wk(1)], xmm2 - - poppic ebx - - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - cmp eax, byte SIZEOF_XMMWORD - ja short .columnloop - alignx 16,7 - -.columnloop_last: - ; -- process the last column block - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - pcmpeqb xmm1,xmm1 - pslldq xmm1,(SIZEOF_XMMWORD-2) - movdqa xmm2,xmm1 
- - pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] - pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] - - movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) - movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) - - jmp near .upsample - alignx 16,7 - -.columnloop: - ; -- process the next column block - - movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] - movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - - pmullw xmm0,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - - movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 - - pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) - pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) - - movdqa XMMWORD [wk(2)], xmm1 - movdqa XMMWORD [wk(3)], xmm2 - -.upsample: - ; -- process the upper row - - movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] - - movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) - movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) - psrldq xmm0,2 ; xmm0=( 1 
2 3 4 5 6 7 --) - pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) - movdqa xmm5,xmm7 - movdqa xmm6,xmm3 - psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) - pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) - - por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) - por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm7 - movdqa xmm2,xmm3 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) - movdqa xmm4,xmm3 - psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(0)], xmm4 - - pmullw xmm7,[GOTOFF(ebx,PW_THREE)] - pmullw xmm3,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] - paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] - - paddw xmm1,xmm7 - paddw xmm5,xmm3 - psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) - paddw xmm0,xmm7 - paddw xmm2,xmm3 - psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) - psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) - - psllw xmm0,BYTE_BIT - psllw xmm2,BYTE_BIT - por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) - por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 - - ; -- process the lower row - - movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] - movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] - - movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) - movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) - psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) - pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) - movdqa xmm0,xmm6 - movdqa xmm2,xmm4 - psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) - pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) - - por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) - por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) - movdqa xmm3,xmm4 - psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(1)], xmm3 - - pmullw xmm6,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] - paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] - - paddw xmm1,xmm6 - paddw xmm0,xmm4 - psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) - psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) - paddw xmm7,xmm6 - paddw xmm5,xmm4 - psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) - psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) - - psllw xmm7,BYTE_BIT - psllw xmm5,BYTE_BIT - por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) - por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 - - poppic ebx - - sub eax, byte SIZEOF_XMMWORD - add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) - add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 - add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) - add edx, byte 2*SIZEOF_XMMWORD ; outptr0 - add edi, byte 2*SIZEOF_XMMWORD ; outptr1 - cmp eax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop ecx - pop eax - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. -; It's still a box filter. 
-; -; GLOBAL(void) -; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, -; JDIMENSION output_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - - align 16 - global EXTN(jsimd_h2v1_upsample_sse2) - -EXTN(jsimd_h2v1_upsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_XMMWORD)-1 - and edx, byte -(2*SIZEOF_XMMWORD) - jz short .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - mov eax,edx ; colctr - alignx 16,7 -.columnloop: - - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 - - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow - - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] - - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 - - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 - - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow - - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add edi, byte 4*SIZEOF_XMMWORD ; outptr - jmp short .columnloop - alignx 16,7 - -.nextrow: - pop esi - pop edi - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg short .rowloop - -.return: - pop edi - pop esi -; pop edx 
; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret - -; -------------------------------------------------------------------------- -; -; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. -; It's still a box filter. -; -; GLOBAL(void) -; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor, -; JDIMENSION output_width, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - - align 16 - global EXTN(jsimd_h2v2_upsample_sse2) - -EXTN(jsimd_h2v2_upsample_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_XMMWORD)-1 - and edx, byte -(2*SIZEOF_XMMWORD) - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - mov eax,edx ; colctr - alignx 16,7 -.columnloop: - - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 - - movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 - - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow - - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] - - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 - - movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], 
xmm2 - movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 - - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow - - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 - add edi, byte 4*SIZEOF_XMMWORD ; outptr1 - jmp short .columnloop - alignx 16,7 - -.nextrow: - pop esi - pop edi - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg short .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jf3dnflt.asm b/Builder/jni-1.11/simd/i386/src/jf3dnflt.asm deleted file mode 100644 index 542672dc5..000000000 --- a/Builder/jni-1.11/simd/i386/src/jf3dnflt.asm +++ /dev/null @@ -1,320 +0,0 @@ -; -; jf3dnflt.asm - floating-point FDCT (3DNow!) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a floating-point implementation of the forward DCT -; (Discrete Cosine Transform). The following code is based directly on -; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. 
-; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_fdct_float_3dnow) - -EXTN(jconst_fdct_float_3dnow): - -PD_0_382 times 2 dd 0.382683432365089771728460 -PD_0_707 times 2 dd 0.707106781186547524400844 -PD_0_541 times 2 dd 0.541196100146196984399723 -PD_1_306 times 2 dd 1.306562964876376527856643 - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform the forward DCT on one block of samples. -; -; GLOBAL(void) -; jsimd_fdct_float_3dnow (FAST_FLOAT * data) -; - -%define data(b) (b)+8 ; FAST_FLOAT * data - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_fdct_float_3dnow) - -EXTN(jsimd_fdct_float_3dnow): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/2 - alignx 16,7 -.rowloop: - - movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] - - ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) - - movq mm4,mm0 ; transpose coefficients - punpckldq mm0,mm1 ; mm0=(00 10)=data0 - punpckhdq mm4,mm1 ; mm4=(01 11)=data1 - movq mm5,mm2 ; transpose coefficients - punpckldq mm2,mm3 ; mm2=(06 16)=data6 - punpckhdq mm5,mm3 ; mm5=(07 17)=data7 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm2 ; mm4=data1-data6=tmp6 - pfsub mm0,mm5 ; mm0=data0-data7=tmp7 - pfadd mm6,mm2 ; mm6=data1+data6=tmp1 - pfadd mm7,mm5 ; mm7=data0+data7=tmp0 - - movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] - movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] - - ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) - - movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 - - movq mm4,mm1 ; transpose coefficients - punpckldq mm1,mm3 ; mm1=(02 12)=data2 - punpckhdq mm4,mm3 ; mm4=(03 13)=data3 - movq mm0,mm2 ; transpose coefficients - punpckldq mm2,mm5 ; mm2=(04 14)=data4 - punpckhdq mm0,mm5 ; mm0=(05 15)=data5 - - movq mm3,mm4 - movq mm5,mm1 - pfadd mm4,mm2 ; mm4=data3+data4=tmp3 - pfadd mm1,mm0 ; mm1=data2+data5=tmp2 - pfsub mm3,mm2 ; mm3=data3-data4=tmp4 - pfsub mm5,mm0 ; mm5=data2-data5=tmp5 - - ; -- Even part - - movq mm2,mm7 - movq mm0,mm6 - pfsub mm7,mm4 ; mm7=tmp13 - pfsub mm6,mm1 ; mm6=tmp12 - pfadd mm2,mm4 ; mm2=tmp10 - pfadd mm0,mm1 ; mm0=tmp11 - - pfadd mm6,mm7 - pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 - - movq mm4,mm2 - movq mm1,mm7 - pfsub mm2,mm0 ; mm2=data4 - pfsub mm7,mm6 ; mm7=data6 - pfadd mm4,mm0 ; mm4=data0 - pfadd mm1,mm6 ; mm1=data2 - - movq MMWORD 
[MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 - movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 - - ; -- Odd part - - movq mm0, MMWORD [wk(0)] ; mm0=tmp6 - movq mm6, MMWORD [wk(1)] ; mm6=tmp7 - - pfadd mm3,mm5 ; mm3=tmp10 - pfadd mm5,mm0 ; mm5=tmp11 - pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 - - pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 - - movq mm2,mm3 ; mm2=tmp10 - pfsub mm3,mm0 - pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 - pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) - pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) - pfadd mm2,mm3 ; mm2=z2 - pfadd mm0,mm3 ; mm0=z4 - - movq mm7,mm6 - pfsub mm6,mm5 ; mm6=z13 - pfadd mm7,mm5 ; mm7=z11 - - movq mm4,mm6 - movq mm1,mm7 - pfsub mm6,mm2 ; mm6=data3 - pfsub mm7,mm0 ; mm7=data7 - pfadd mm4,mm2 ; mm4=data5 - pfadd mm1,mm0 ; mm1=data1 - - movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 - - add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. 
- - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/2 - alignx 16,7 -.columnloop: - - movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] - - ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) - - movq mm4,mm0 ; transpose coefficients - punpckldq mm0,mm1 ; mm0=(00 01)=data0 - punpckhdq mm4,mm1 ; mm4=(10 11)=data1 - movq mm5,mm2 ; transpose coefficients - punpckldq mm2,mm3 ; mm2=(60 61)=data6 - punpckhdq mm5,mm3 ; mm5=(70 71)=data7 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm2 ; mm4=data1-data6=tmp6 - pfsub mm0,mm5 ; mm0=data0-data7=tmp7 - pfadd mm6,mm2 ; mm6=data1+data6=tmp1 - pfadd mm7,mm5 ; mm7=data0+data7=tmp0 - - movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] - movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] - - ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) - - movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 - - movq mm4,mm1 ; transpose coefficients - punpckldq mm1,mm3 ; mm1=(20 21)=data2 - punpckhdq mm4,mm3 ; mm4=(30 31)=data3 - movq mm0,mm2 ; transpose coefficients - punpckldq mm2,mm5 ; mm2=(40 41)=data4 - punpckhdq mm0,mm5 ; mm0=(50 51)=data5 - - movq mm3,mm4 - movq mm5,mm1 - pfadd mm4,mm2 ; mm4=data3+data4=tmp3 - pfadd mm1,mm0 ; mm1=data2+data5=tmp2 - pfsub mm3,mm2 ; mm3=data3-data4=tmp4 - pfsub mm5,mm0 ; mm5=data2-data5=tmp5 - - ; -- Even part - - movq mm2,mm7 - movq mm0,mm6 - pfsub mm7,mm4 ; mm7=tmp13 - pfsub mm6,mm1 ; mm6=tmp12 - pfadd mm2,mm4 ; mm2=tmp10 - pfadd mm0,mm1 ; mm0=tmp11 - - pfadd mm6,mm7 - pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 - - movq mm4,mm2 - movq mm1,mm7 - pfsub mm2,mm0 ; mm2=data4 - pfsub mm7,mm6 ; mm7=data6 - pfadd mm4,mm0 ; mm4=data0 - pfadd mm1,mm6 ; mm1=data2 - - movq MMWORD 
[MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 - movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 - - ; -- Odd part - - movq mm0, MMWORD [wk(0)] ; mm0=tmp6 - movq mm6, MMWORD [wk(1)] ; mm6=tmp7 - - pfadd mm3,mm5 ; mm3=tmp10 - pfadd mm5,mm0 ; mm5=tmp11 - pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 - - pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 - - movq mm2,mm3 ; mm2=tmp10 - pfsub mm3,mm0 - pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 - pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) - pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) - pfadd mm2,mm3 ; mm2=z2 - pfadd mm0,mm3 ; mm0=z4 - - movq mm7,mm6 - pfsub mm6,mm5 ; mm6=z13 - pfadd mm7,mm5 ; mm7=z11 - - movq mm4,mm6 - movq mm1,mm7 - pfsub mm6,mm2 ; mm6=data3 - pfsub mm7,mm0 ; mm7=data7 - pfadd mm4,mm2 ; mm4=data5 - pfadd mm1,mm0 ; mm1=data1 - - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 - - add edx, byte 2*SIZEOF_FAST_FLOAT - dec ecx - jnz near .columnloop - - femms ; empty MMX/3DNow! state - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jfmmxfst.asm b/Builder/jni-1.11/simd/i386/src/jfmmxfst.asm deleted file mode 100644 index 0647242a9..000000000 --- a/Builder/jni-1.11/simd/i386/src/jfmmxfst.asm +++ /dev/null @@ -1,397 +0,0 @@ -; -; jfmmxfst.asm - fast integer FDCT (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a fast, not so accurate integer implementation of -; the forward DCT (Discrete Cosine Transform). The following code is -; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c -; for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 8 ; 14 is also OK. - -%if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - -; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) -; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) - -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - - alignz 16 - global EXTN(jconst_fdct_ifast_mmx) - -EXTN(jconst_fdct_ifast_mmx): - -PW_F0707 times 4 dw F_0_707 << CONST_SHIFT -PW_F0382 times 4 dw F_0_382 << CONST_SHIFT -PW_F0541 times 4 dw F_0_541 << CONST_SHIFT -PW_F1306 times 4 dw F_1_306 << CONST_SHIFT - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform the forward DCT on one block of samples. -; -; GLOBAL(void) -; jsimd_fdct_ifast_mmx (DCTELEM * data) -; - -%define data(b) (b)+8 ; DCTELEM * data - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_fdct_ifast_mmx) - -EXTN(jsimd_fdct_ifast_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.rowloop: - - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] - - ; mm0=(20 21 22 23), mm2=(24 25 26 27) - ; mm1=(30 31 32 33), mm3=(34 35 36 37) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(20 30 21 31) - punpckhwd mm4,mm1 ; mm4=(22 32 23 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(24 34 25 35) - punpckhwd mm5,mm3 ; mm5=(26 36 27 37) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 01 02 03), mm1=(04 05 06 07) - ; mm7=(10 11 12 13), mm3=(14 15 16 17) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 10 01 11) - punpckhwd mm4,mm7 ; mm4=(02 12 03 13) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(04 14 05 15) - punpckhwd mm2,mm3 ; mm2=(06 16 07 17) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 - punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 - punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) - movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose 
coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 - punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 - punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - psubw mm5,mm7 ; mm5=tmp13 - psubw mm0,mm4 ; mm0=tmp12 - paddw mm1,mm7 ; mm1=tmp10 - paddw mm6,mm4 ; mm6=tmp11 - - paddw mm0,mm5 - psllw mm0,PRE_MULTIPLY_SCALE_BITS - pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 - - movq mm7,mm1 - movq mm4,mm5 - psubw mm1,mm6 ; mm1=data4 - psubw mm5,mm0 ; mm5=data6 - paddw mm7,mm6 ; mm7=data0 - paddw mm4,mm0 ; mm4=data2 - - movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - - ; -- Odd part - - movq mm6, MMWORD [wk(0)] ; mm6=tmp6 - movq mm0, MMWORD [wk(1)] ; mm0=tmp7 - - paddw mm2,mm3 ; mm2=tmp10 - paddw mm3,mm6 ; mm3=tmp11 - paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 - - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm6,PRE_MULTIPLY_SCALE_BITS - - psllw mm3,PRE_MULTIPLY_SCALE_BITS - pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 - - movq mm1,mm2 ; mm1=tmp10 - psubw mm2,mm6 - pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 - pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) - pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) - paddw mm1,mm2 ; mm1=z2 - paddw mm6,mm2 ; mm6=z4 - - movq mm5,mm0 - psubw mm0,mm3 ; mm0=z13 - paddw mm5,mm3 ; mm5=z11 - - movq mm7,mm0 - movq mm4,mm5 - psubw mm0,mm1 ; mm0=data3 - psubw mm5,mm6 ; mm5=data7 - paddw mm7,mm1 ; mm7=data5 - paddw mm4,mm6 ; mm4=data1 - - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 - movq MMWORD 
[MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 - - add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.columnloop: - - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; mm0=(02 12 22 32), mm2=(42 52 62 72) - ; mm1=(03 13 23 33), mm3=(43 53 63 73) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(02 03 12 13) - punpckhwd mm4,mm1 ; mm4=(22 23 32 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(42 43 52 53) - punpckhwd mm5,mm3 ; mm5=(62 63 72 73) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 10 20 30), mm1=(40 50 60 70) - ; mm7=(01 11 21 31), mm3=(41 51 61 71) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 01 10 11) - punpckhwd mm4,mm7 ; mm4=(20 21 30 31) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(40 41 50 51) - punpckhwd mm2,mm3 ; mm2=(60 61 70 71) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 - punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 - punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 
- paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) - movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 - punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 - punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - psubw mm5,mm7 ; mm5=tmp13 - psubw mm0,mm4 ; mm0=tmp12 - paddw mm1,mm7 ; mm1=tmp10 - paddw mm6,mm4 ; mm6=tmp11 - - paddw mm0,mm5 - psllw mm0,PRE_MULTIPLY_SCALE_BITS - pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 - - movq mm7,mm1 - movq mm4,mm5 - psubw mm1,mm6 ; mm1=data4 - psubw mm5,mm0 ; mm5=data6 - paddw mm7,mm6 ; mm7=data0 - paddw mm4,mm0 ; mm4=data2 - - movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - - ; -- Odd part - - movq mm6, MMWORD [wk(0)] ; mm6=tmp6 - movq mm0, MMWORD [wk(1)] ; mm0=tmp7 - - paddw mm2,mm3 ; mm2=tmp10 - paddw mm3,mm6 ; mm3=tmp11 - paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 - - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm6,PRE_MULTIPLY_SCALE_BITS - - psllw mm3,PRE_MULTIPLY_SCALE_BITS - pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 - - movq mm1,mm2 ; mm1=tmp10 - psubw mm2,mm6 - pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 - pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) - pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) - paddw mm1,mm2 ; mm1=z2 - paddw mm6,mm2 ; mm6=z4 - - movq mm5,mm0 - psubw mm0,mm3 ; mm0=z13 - paddw 
mm5,mm3 ; mm5=z11 - - movq mm7,mm0 - movq mm4,mm5 - psubw mm0,mm1 ; mm0=data3 - psubw mm5,mm6 ; mm5=data7 - paddw mm7,mm1 ; mm7=data5 - paddw mm4,mm6 ; mm4=data1 - - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 - - add edx, byte 4*SIZEOF_DCTELEM - dec ecx - jnz near .columnloop - - emms ; empty MMX state - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jfmmxint.asm b/Builder/jni-1.11/simd/i386/src/jfmmxint.asm deleted file mode 100644 index a7e73f73a..000000000 --- a/Builder/jni-1.11/simd/i386/src/jfmmxint.asm +++ /dev/null @@ -1,622 +0,0 @@ -; -; jfmmxint.asm - accurate integer FDCT (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a slow-but-accurate integer implementation of the -; forward DCT (Discrete Cosine Transform). The following code is based -; directly on the IJG's original jfdctint.c; see the jfdctint.c for -; more details. 
-; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 13 -%define PASS1_BITS 2 - -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) - -%if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. -%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_fdct_islow_mmx) - -EXTN(jconst_fdct_islow_mmx): - -PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 2 dw 
(F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform the forward DCT on one block of samples. -; -; GLOBAL(void) -; jsimd_fdct_islow_mmx (DCTELEM * data) -; - -%define data(b) (b)+8 ; DCTELEM * data - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_fdct_islow_mmx) - -EXTN(jsimd_fdct_islow_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.rowloop: - - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] - - ; mm0=(20 21 22 23), mm2=(24 25 26 27) - ; mm1=(30 31 32 33), mm3=(34 35 36 37) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(20 30 21 31) - punpckhwd mm4,mm1 ; mm4=(22 32 23 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(24 34 25 35) - punpckhwd mm5,mm3 ; mm5=(26 36 27 37) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 01 02 03), mm1=(04 05 06 07) - ; mm7=(10 11 12 13), mm3=(14 15 16 17) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 10 01 11) - punpckhwd mm4,mm7 ; mm4=(02 12 03 13) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(04 14 05 15) - punpckhwd mm2,mm3 ; mm2=(06 16 07 17) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 - punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 - punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) - movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose 
coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 - punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 - punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - paddw mm5,mm7 ; mm5=tmp10 - paddw mm0,mm4 ; mm0=tmp11 - psubw mm1,mm7 ; mm1=tmp13 - psubw mm6,mm4 ; mm6=tmp12 - - movq mm7,mm5 - paddw mm5,mm0 ; mm5=tmp10+tmp11 - psubw mm7,mm0 ; mm7=tmp10-tmp11 - - psllw mm5,PASS1_BITS ; mm5=data0 - psllw mm7,PASS1_BITS ; mm7=data4 - - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movq mm4,mm1 ; mm1=tmp13 - movq mm0,mm1 - punpcklwd mm4,mm6 ; mm6=tmp12 - punpckhwd mm0,mm6 - movq mm1,mm4 - movq mm6,mm0 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L - pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L - pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm4,DESCALE_P1 - psrad mm0,DESCALE_P1 - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm1,DESCALE_P1 - psrad mm6,DESCALE_P1 - - packssdw mm4,mm0 ; mm4=data2 - packssdw mm1,mm6 ; mm1=data6 - - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 - - ; -- Odd part - - movq mm5, MMWORD [wk(0)] ; 
mm5=tmp6 - movq mm7, MMWORD [wk(1)] ; mm7=tmp7 - - movq mm0,mm2 ; mm2=tmp4 - movq mm6,mm3 ; mm3=tmp5 - paddw mm0,mm5 ; mm0=z3 - paddw mm6,mm7 ; mm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm4,mm0 - movq mm1,mm0 - punpcklwd mm4,mm6 - punpckhwd mm1,mm6 - movq mm0,mm4 - movq mm6,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L - pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H - pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L - pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H - - movq MMWORD [wk(0)], mm4 ; wk(0)=z3L - movq MMWORD [wk(1)], mm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movq mm4,mm2 - movq mm1,mm2 - punpcklwd mm4,mm7 - punpckhwd mm1,mm7 - movq mm2,mm4 - movq mm7,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L - pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H - pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L - pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H - - paddd mm4, MMWORD [wk(0)] ; mm4=data7L - paddd mm1, MMWORD [wk(1)] ; mm1=data7H - paddd mm2,mm0 ; mm2=data1L - paddd mm7,mm6 ; 
mm7=data1H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm4,DESCALE_P1 - psrad mm1,DESCALE_P1 - paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm2,DESCALE_P1 - psrad mm7,DESCALE_P1 - - packssdw mm4,mm1 ; mm4=data7 - packssdw mm2,mm7 ; mm2=data1 - - movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 - - movq mm1,mm3 - movq mm7,mm3 - punpcklwd mm1,mm5 - punpckhwd mm7,mm5 - movq mm3,mm1 - movq mm5,mm7 - pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L - pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H - pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L - pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H - - paddd mm1,mm0 ; mm1=data5L - paddd mm7,mm6 ; mm7=data5H - paddd mm3, MMWORD [wk(0)] ; mm3=data3L - paddd mm5, MMWORD [wk(1)] ; mm5=data3H - - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm1,DESCALE_P1 - psrad mm7,DESCALE_P1 - paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm3,DESCALE_P1 - psrad mm5,DESCALE_P1 - - packssdw mm1,mm7 ; mm1=data5 - packssdw mm3,mm5 ; mm3=data3 - - movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 - - add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. 
- - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.columnloop: - - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; mm0=(02 12 22 32), mm2=(42 52 62 72) - ; mm1=(03 13 23 33), mm3=(43 53 63 73) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(02 03 12 13) - punpckhwd mm4,mm1 ; mm4=(22 23 32 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(42 43 52 53) - punpckhwd mm5,mm3 ; mm5=(62 63 72 73) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 10 20 30), mm1=(40 50 60 70) - ; mm7=(01 11 21 31), mm3=(41 51 61 71) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 01 10 11) - punpckhwd mm4,mm7 ; mm4=(20 21 30 31) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(40 41 50 51) - punpckhwd mm2,mm3 ; mm2=(60 61 70 71) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 - punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 - punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) - movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose 
coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 - punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 - punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - paddw mm5,mm7 ; mm5=tmp10 - paddw mm0,mm4 ; mm0=tmp11 - psubw mm1,mm7 ; mm1=tmp13 - psubw mm6,mm4 ; mm6=tmp12 - - movq mm7,mm5 - paddw mm5,mm0 ; mm5=tmp10+tmp11 - psubw mm7,mm0 ; mm7=tmp10-tmp11 - - paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)] - paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)] - psraw mm5,PASS1_BITS ; mm5=data0 - psraw mm7,PASS1_BITS ; mm7=data4 - - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movq mm4,mm1 ; mm1=tmp13 - movq mm0,mm1 - punpcklwd mm4,mm6 ; mm6=tmp12 - punpckhwd mm0,mm6 - movq mm1,mm4 - movq mm6,mm0 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L - pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L - pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm4,DESCALE_P2 - psrad mm0,DESCALE_P2 - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm1,DESCALE_P2 - psrad mm6,DESCALE_P2 - - packssdw mm4,mm0 ; mm4=data2 - packssdw mm1,mm6 ; mm1=data6 - - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD 
[MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 - - ; -- Odd part - - movq mm5, MMWORD [wk(0)] ; mm5=tmp6 - movq mm7, MMWORD [wk(1)] ; mm7=tmp7 - - movq mm0,mm2 ; mm2=tmp4 - movq mm6,mm3 ; mm3=tmp5 - paddw mm0,mm5 ; mm0=z3 - paddw mm6,mm7 ; mm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm4,mm0 - movq mm1,mm0 - punpcklwd mm4,mm6 - punpckhwd mm1,mm6 - movq mm0,mm4 - movq mm6,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L - pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H - pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L - pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H - - movq MMWORD [wk(0)], mm4 ; wk(0)=z3L - movq MMWORD [wk(1)], mm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movq mm4,mm2 - movq mm1,mm2 - punpcklwd mm4,mm7 - punpckhwd mm1,mm7 - movq mm2,mm4 - movq mm7,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L - pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H - pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L - pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H - - paddd mm4, MMWORD [wk(0)] ; mm4=data7L - paddd 
mm1, MMWORD [wk(1)] ; mm1=data7H - paddd mm2,mm0 ; mm2=data1L - paddd mm7,mm6 ; mm7=data1H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm4,DESCALE_P2 - psrad mm1,DESCALE_P2 - paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm2,DESCALE_P2 - psrad mm7,DESCALE_P2 - - packssdw mm4,mm1 ; mm4=data7 - packssdw mm2,mm7 ; mm2=data1 - - movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 - - movq mm1,mm3 - movq mm7,mm3 - punpcklwd mm1,mm5 - punpckhwd mm7,mm5 - movq mm3,mm1 - movq mm5,mm7 - pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L - pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H - pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L - pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H - - paddd mm1,mm0 ; mm1=data5L - paddd mm7,mm6 ; mm7=data5H - paddd mm3, MMWORD [wk(0)] ; mm3=data3L - paddd mm5, MMWORD [wk(1)] ; mm5=data3H - - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm1,DESCALE_P2 - psrad mm7,DESCALE_P2 - paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm3,DESCALE_P2 - psrad mm5,DESCALE_P2 - - packssdw mm1,mm7 ; mm1=data5 - packssdw mm3,mm5 ; mm3=data3 - - movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 - - add edx, byte 4*SIZEOF_DCTELEM - dec ecx - jnz near .columnloop - - emms ; empty MMX state - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jfss2fst.asm b/Builder/jni-1.11/simd/i386/src/jfss2fst.asm deleted file mode 100644 index 73fc9e51a..000000000 --- a/Builder/jni-1.11/simd/i386/src/jfss2fst.asm +++ /dev/null @@ -1,404 +0,0 @@ -; -; jfss2fst.asm - fast integer FDCT (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a fast, not so accurate integer implementation of -; the forward DCT (Discrete Cosine Transform). The following code is -; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c -; for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 8 ; 14 is also OK. - -%if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - -; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) -; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) - -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - - alignz 16 - global EXTN(jconst_fdct_ifast_sse2) - -EXTN(jconst_fdct_ifast_sse2): - -PW_F0707 times 8 dw F_0_707 << CONST_SHIFT -PW_F0382 times 8 dw F_0_382 << CONST_SHIFT -PW_F0541 times 8 dw F_0_541 << CONST_SHIFT -PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform the forward DCT on one block of samples. -; -; GLOBAL(void) -; jsimd_fdct_ifast_sse2 (DCTELEM * data) -; - -%define data(b) (b)+8 ; DCTELEM * data - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_fdct_ifast_sse2) - -EXTN(jsimd_fdct_ifast_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] 
; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - psubw xmm3,xmm1 ; xmm3=tmp13 - psubw xmm6,xmm7 ; xmm6=tmp12 - paddw xmm4,xmm1 ; xmm4=tmp10 - paddw 
xmm0,xmm7 ; xmm0=tmp11 - - paddw xmm6,xmm3 - psllw xmm6,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 - - movdqa xmm1,xmm4 - movdqa xmm7,xmm3 - psubw xmm4,xmm0 ; xmm4=data4 - psubw xmm3,xmm6 ; xmm3=data6 - paddw xmm1,xmm0 ; xmm1=data0 - paddw xmm7,xmm6 ; xmm7=data2 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 - - ; -- Odd part - - paddw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm5,xmm0 ; xmm5=tmp11 - paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 - - movdqa xmm4,xmm2 ; xmm4=tmp10 - psubw xmm2,xmm0 - pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 - pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm2 ; xmm4=z2 - paddw xmm0,xmm2 ; xmm0=z4 - - movdqa xmm3,xmm6 - psubw xmm6,xmm5 ; xmm6=z13 - paddw xmm3,xmm5 ; xmm3=z11 - - movdqa xmm2,xmm6 - movdqa xmm5,xmm3 - psubw xmm6,xmm4 ; xmm6=data3 - psubw xmm3,xmm0 ; xmm3=data7 - paddw xmm2,xmm4 ; xmm2=data5 - paddw xmm5,xmm0 ; xmm5=data1 - - ; ---- Pass 2: process columns. 
- -; mov edx, POINTER [data(eax)] ; (DCTELEM *) - - ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) - ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) - punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 - - ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) - ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm7,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) - punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) - movdqa xmm0,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) - punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) - - movdqa xmm2,xmm5 ; transpose coefficients(phase 2) - punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) - punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) - movdqa xmm3,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) - punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) - - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) - - movdqa xmm2,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) - punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) - movdqa xmm7,xmm4 ; transpose coefficients(phase 
2) - punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) - punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm0,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm5,xmm6 - movdqa xmm3,xmm1 - psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 - psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 - paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 - paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 - - movdqa xmm6,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm1,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm7,xmm6 - movdqa xmm0,xmm2 - paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 - paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 - psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 - psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm1,xmm5 - psubw xmm3,xmm6 ; xmm3=tmp13 - psubw xmm5,xmm2 ; xmm5=tmp12 - paddw xmm4,xmm6 ; xmm4=tmp10 - paddw xmm1,xmm2 ; xmm1=tmp11 - - paddw xmm5,xmm3 - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 - - movdqa xmm6,xmm4 - movdqa xmm2,xmm3 - psubw xmm4,xmm1 ; xmm4=data4 - psubw xmm3,xmm5 ; xmm3=data6 - paddw xmm6,xmm1 ; xmm6=data0 - paddw xmm2,xmm5 ; xmm2=data2 - - movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD 
[XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 - - ; -- Odd part - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - paddw xmm7,xmm0 ; xmm7=tmp10 - paddw xmm0,xmm1 ; xmm0=tmp11 - paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 - - psllw xmm7,PRE_MULTIPLY_SCALE_BITS - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 - - movdqa xmm4,xmm7 ; xmm4=tmp10 - psubw xmm7,xmm1 - pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 - pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm7 ; xmm4=z2 - paddw xmm1,xmm7 ; xmm1=z4 - - movdqa xmm3,xmm5 - psubw xmm5,xmm0 ; xmm5=z13 - paddw xmm3,xmm0 ; xmm3=z11 - - movdqa xmm6,xmm5 - movdqa xmm2,xmm3 - psubw xmm5,xmm4 ; xmm5=data3 - psubw xmm3,xmm1 ; xmm3=data7 - paddw xmm6,xmm4 ; xmm6=data5 - paddw xmm2,xmm1 ; xmm2=data1 - - movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 - movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jfss2int.asm b/Builder/jni-1.11/simd/i386/src/jfss2int.asm deleted file mode 100644 index 5e3f2aaa9..000000000 --- a/Builder/jni-1.11/simd/i386/src/jfss2int.asm +++ /dev/null @@ -1,634 +0,0 @@ -; -; jfss2int.asm - accurate integer FDCT (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a slow-but-accurate integer implementation of the -; forward DCT (Discrete Cosine Transform). The following code is based -; directly on the IJG's original jfdctint.c; see the jfdctint.c for -; more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 13 -%define PASS1_BITS 2 - -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) - -%if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_fdct_islow_sse2) - -EXTN(jconst_fdct_islow_sse2): - -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform the forward DCT on one block of samples. 
-; -; GLOBAL(void) -; jsimd_fdct_islow_sse2 (DCTELEM * data) -; - -%define data(b) (b)+8 ; DCTELEM * data - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 6 - - align 16 - global EXTN(jsimd_fdct_islow_sse2) - -EXTN(jsimd_fdct_islow_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; 
xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 
3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - paddw xmm3,xmm1 ; xmm3=tmp10 - paddw xmm6,xmm7 ; xmm6=tmp11 - psubw xmm4,xmm1 ; xmm4=tmp13 - psubw xmm0,xmm7 ; xmm0=tmp12 - - movdqa xmm1,xmm3 - paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 - psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 - - psllw xmm3,PASS1_BITS ; xmm3=data0 - psllw xmm1,PASS1_BITS ; xmm1=data4 - - movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 - movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm7,xmm4 ; xmm4=tmp13 - movdqa xmm6,xmm4 - punpcklwd xmm7,xmm0 ; xmm0=tmp12 - punpckhwd xmm6,xmm0 - movdqa xmm4,xmm7 - movdqa xmm0,xmm6 - pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L - pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H - pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L - pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H - - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm7,DESCALE_P1 - psrad xmm6,DESCALE_P1 - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm4,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm7,xmm6 ; xmm7=data2 - packssdw xmm4,xmm0 ; xmm4=data6 - - movdqa XMMWORD [wk(4)], 
xmm7 ; wk(4)=data2 - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 - - ; -- Odd part - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 - - movdqa xmm6,xmm2 ; xmm2=tmp4 - movdqa xmm0,xmm5 ; xmm5=tmp5 - paddw xmm6,xmm3 ; xmm6=z3 - paddw xmm0,xmm1 ; xmm0=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm7,xmm6 - movdqa xmm4,xmm6 - punpcklwd xmm7,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm6,xmm7 - movdqa xmm0,xmm4 - pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H - pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L - pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm7,xmm2 - movdqa xmm4,xmm2 - punpcklwd xmm7,xmm1 - punpckhwd xmm4,xmm1 - movdqa xmm2,xmm7 - movdqa xmm1,xmm4 - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H - pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; 
xmm2=tmp7L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H - - paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L - paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H - paddd xmm2,xmm6 ; xmm2=data1L - paddd xmm1,xmm0 ; xmm1=data1H - - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm7,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm2,DESCALE_P1 - psrad xmm1,DESCALE_P1 - - packssdw xmm7,xmm4 ; xmm7=data7 - packssdw xmm2,xmm1 ; xmm2=data1 - - movdqa xmm4,xmm5 - movdqa xmm1,xmm5 - punpcklwd xmm4,xmm3 - punpckhwd xmm1,xmm3 - movdqa xmm5,xmm4 - movdqa xmm3,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H - pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L - pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H - - paddd xmm4,xmm6 ; xmm4=data5L - paddd xmm1,xmm0 ; xmm1=data5H - paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L - paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm4,DESCALE_P1 - psrad xmm1,DESCALE_P1 - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm5,DESCALE_P1 - psrad xmm3,DESCALE_P1 - - packssdw xmm4,xmm1 ; xmm4=data5 - packssdw xmm5,xmm3 ; xmm5=data3 - - ; ---- Pass 2: process columns. 
- -; mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 - movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 - - ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) - ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) - - movdqa xmm1,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) - punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) - movdqa xmm3,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) - punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) - - movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 - movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 - - ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) - ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm0,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) - punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) - movdqa xmm3,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) - punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) - - movdqa xmm4,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) - punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) - punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) - punpckhdq 
xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) - punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) - - movdqa xmm5,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm2,xmm5 - movdqa xmm7,xmm6 - psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 - psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 - paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 - paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 - - movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) - movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movdqa xmm5,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm0,xmm5 - movdqa xmm3,xmm4 - paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 - paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 - psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 - psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm1,xmm7 - movdqa xmm6,xmm2 - paddw xmm7,xmm5 ; xmm7=tmp10 - paddw xmm2,xmm4 ; xmm2=tmp11 - psubw xmm1,xmm5 ; xmm1=tmp13 - psubw xmm6,xmm4 ; xmm6=tmp12 - - movdqa xmm5,xmm7 - paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 - - paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] - paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] - psraw xmm7,PASS1_BITS ; xmm7=data0 - psraw xmm5,PASS1_BITS ; xmm5=data4 - - 
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 - movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm4,xmm1 ; xmm1=tmp13 - movdqa xmm2,xmm1 - punpcklwd xmm4,xmm6 ; xmm6=tmp12 - punpckhwd xmm2,xmm6 - movdqa xmm1,xmm4 - movdqa xmm6,xmm2 - pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L - pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L - pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm4,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm1,DESCALE_P2 - psrad xmm6,DESCALE_P2 - - packssdw xmm4,xmm2 ; xmm4=data2 - packssdw xmm1,xmm6 ; xmm1=data6 - - movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 - - ; -- Odd part - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - movdqa xmm2,xmm0 ; xmm0=tmp4 - movdqa xmm6,xmm3 ; xmm3=tmp5 - paddw xmm2,xmm7 ; xmm2=z3 - paddw xmm6,xmm5 ; xmm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm4,xmm2 - movdqa xmm1,xmm2 - punpcklwd xmm4,xmm6 - punpckhwd xmm1,xmm6 - movdqa xmm2,xmm4 - movdqa xmm6,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H - pmaddwd 
xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L - pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm4,xmm0 - movdqa xmm1,xmm0 - punpcklwd xmm4,xmm5 - punpckhwd xmm1,xmm5 - movdqa xmm0,xmm4 - movdqa xmm5,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H - pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L - pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H - - paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L - paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H - paddd xmm0,xmm2 ; xmm0=data1L - paddd xmm5,xmm6 ; xmm5=data1H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm4,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm0,DESCALE_P2 - psrad xmm5,DESCALE_P2 - - packssdw xmm4,xmm1 ; xmm4=data7 - packssdw xmm0,xmm5 ; xmm0=data1 - - movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 - - movdqa xmm1,xmm3 - movdqa xmm5,xmm3 - punpcklwd xmm1,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm3,xmm1 - movdqa xmm7,xmm5 - pmaddwd 
xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L - pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H - pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H - - paddd xmm1,xmm2 ; xmm1=data5L - paddd xmm5,xmm6 ; xmm5=data5H - paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L - paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H - - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm1,DESCALE_P2 - psrad xmm5,DESCALE_P2 - paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm3,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm1,xmm5 ; xmm1=data5 - packssdw xmm3,xmm7 ; xmm3=data3 - - movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jfsseflt.asm b/Builder/jni-1.11/simd/i386/src/jfsseflt.asm deleted file mode 100644 index bc54cccde..000000000 --- a/Builder/jni-1.11/simd/i386/src/jfsseflt.asm +++ /dev/null @@ -1,370 +0,0 @@ -; -; jfsseflt.asm - floating-point FDCT (SSE) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a floating-point implementation of the forward DCT -; (Discrete Cosine Transform). The following code is based directly on -; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 -%endmacro - -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE -%endmacro - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_fdct_float_sse) - -EXTN(jconst_fdct_float_sse): - -PD_0_382 times 4 dd 0.382683432365089771728460 -PD_0_707 times 4 dd 0.707106781186547524400844 -PD_0_541 times 4 dd 0.541196100146196984399723 -PD_1_306 times 4 dd 1.306562964876376527856643 - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform the forward DCT on one block of samples. -; -; GLOBAL(void) -; jsimd_fdct_float_sse (FAST_FLOAT * data) -; - -%define data(b) (b)+8 ; FAST_FLOAT * data - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_fdct_float_sse) - -EXTN(jsimd_fdct_float_sse): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.rowloop: - - movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) - ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) - unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) - unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) - ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) - unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) - movaps 
xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 - mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; 
xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. - - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/4 - alignx 16,7 -.columnloop: - - movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) - ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) - unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) - unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) - ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) - unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) - unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 - unpckhps2 
xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps 
xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 - mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - add edx, byte 4*SIZEOF_FAST_FLOAT - dec ecx - jnz near .columnloop - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/ji3dnflt.asm b/Builder/jni-1.11/simd/i386/src/ji3dnflt.asm deleted file mode 100644 index dc2076f41..000000000 --- a/Builder/jni-1.11/simd/i386/src/ji3dnflt.asm +++ /dev/null @@ -1,452 +0,0 @@ -; -; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a floating-point implementation of the inverse DCT -; (Discrete Cosine Transform). The following code is based directly on -; the IJG's original jidctflt.c; see the jidctflt.c for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_float_3dnow) - -EXTN(jconst_idct_float_3dnow): - -PD_1_414 times 2 dd 1.414213562373095048801689 -PD_1_847 times 2 dd 1.847759065022573512256366 -PD_1_082 times 2 dd 1.082392200292393968799446 -PD_2_613 times 2 dd 2.613125929752753055713286 -PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 8 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. 
-; -; GLOBAL(void) -; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] - - align 16 - global EXTN(jsimd_idct_float_3dnow) - -EXTN(jsimd_idct_float_3dnow): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT * wsptr - mov ecx, DCTSIZE/2 ; ctr - alignx 16,7 -.columnloop: -%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - pushpic ebx ; save GOT address - mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] - mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] - or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] - or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] - or eax,ebx - poppic ebx ; restore GOT address - jnz short .columnDCT - - ; -- AC terms all zero - - movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpcklwd mm0,mm0 - psrad mm0,(DWORD_BIT-WORD_BIT) - pi2fd mm0,mm0 - - pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movq mm1,mm0 - punpckldq mm0,mm0 - punpckhdq mm1,mm1 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 - jmp near .nextcolumn - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] - movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] - movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpcklwd mm0,mm0 - punpcklwd mm1,mm1 - psrad mm0,(DWORD_BIT-WORD_BIT) - psrad mm1,(DWORD_BIT-WORD_BIT) - pi2fd mm0,mm0 - pi2fd mm1,mm1 - - pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - 
punpcklwd mm2,mm2 - punpcklwd mm3,mm3 - psrad mm2,(DWORD_BIT-WORD_BIT) - psrad mm3,(DWORD_BIT-WORD_BIT) - pi2fd mm2,mm2 - pi2fd mm3,mm3 - - pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movq mm4,mm0 - movq mm5,mm1 - pfsub mm0,mm2 ; mm0=tmp11 - pfsub mm1,mm3 - pfadd mm4,mm2 ; mm4=tmp10 - pfadd mm5,mm3 ; mm5=tmp13 - - pfmul mm1,[GOTOFF(ebx,PD_1_414)] - pfsub mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm5 ; mm4=tmp3 - pfsub mm0,mm1 ; mm0=tmp2 - pfadd mm6,mm5 ; mm6=tmp0 - pfadd mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; tmp3 - movq MMWORD [wk(0)], mm0 ; tmp2 - - ; -- Odd part - - movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] - movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] - movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpcklwd mm2,mm2 - punpcklwd mm3,mm3 - psrad mm2,(DWORD_BIT-WORD_BIT) - psrad mm3,(DWORD_BIT-WORD_BIT) - pi2fd mm2,mm2 - pi2fd mm3,mm3 - - pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - punpcklwd mm5,mm5 - punpcklwd mm1,mm1 - psrad mm5,(DWORD_BIT-WORD_BIT) - psrad mm1,(DWORD_BIT-WORD_BIT) - pi2fd mm5,mm5 - pi2fd mm1,mm1 - - pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movq mm4,mm2 - movq mm0,mm5 - pfadd mm2,mm1 ; mm2=z11 - pfadd mm5,mm3 ; mm5=z13 - pfsub mm4,mm1 ; mm4=z12 - pfsub mm0,mm3 ; mm0=z10 - - movq mm1,mm2 - pfsub mm2,mm5 - pfadd mm1,mm5 ; mm1=tmp7 - - pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 - - movq mm3,mm0 - pfadd mm0,mm4 - pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 - pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) - pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) - pfsubr mm3,mm0 ; mm3=tmp12 - pfsub mm4,mm0 ; mm4=tmp10 - - ; -- Final output stage - - pfsub mm3,mm1 ; mm3=tmp6 - movq mm5,mm6 - 
movq mm0,mm7 - pfadd mm6,mm1 ; mm6=data0=(00 01) - pfadd mm7,mm3 ; mm7=data1=(10 11) - pfsub mm5,mm1 ; mm5=data7=(70 71) - pfsub mm0,mm3 ; mm0=data6=(60 61) - pfsub mm2,mm3 ; mm2=tmp5 - - movq mm1,mm6 ; transpose coefficients - punpckldq mm6,mm7 ; mm6=(00 10) - punpckhdq mm1,mm7 ; mm1=(01 11) - movq mm3,mm0 ; transpose coefficients - punpckldq mm0,mm5 ; mm0=(60 70) - punpckhdq mm3,mm5 ; mm3=(61 71) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 - - movq mm7, MMWORD [wk(0)] ; mm7=tmp2 - movq mm5, MMWORD [wk(1)] ; mm5=tmp3 - - pfadd mm4,mm2 ; mm4=tmp4 - movq mm6,mm7 - movq mm1,mm5 - pfadd mm7,mm2 ; mm7=data2=(20 21) - pfadd mm5,mm4 ; mm5=data4=(40 41) - pfsub mm6,mm2 ; mm6=data5=(50 51) - pfsub mm1,mm4 ; mm1=data3=(30 31) - - movq mm0,mm7 ; transpose coefficients - punpckldq mm7,mm1 ; mm7=(20 30) - punpckhdq mm0,mm1 ; mm0=(21 31) - movq mm3,mm5 ; transpose coefficients - punpckldq mm5,mm6 ; mm5=(40 50) - punpckhdq mm3,mm6 ; mm3=(41 51) - - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 - movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 - -.nextcolumn: - add esi, byte 2*SIZEOF_JCOEF ; coef_block - add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/2 ; ctr - alignx 16,7 -.rowloop: - - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movq mm4,mm0 - movq mm5,mm1 - pfsub mm0,mm2 ; mm0=tmp11 - pfsub mm1,mm3 - pfadd mm4,mm2 ; mm4=tmp10 - pfadd mm5,mm3 ; mm5=tmp13 - - pfmul mm1,[GOTOFF(ebx,PD_1_414)] - pfsub mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm5 ; mm4=tmp3 - pfsub mm0,mm1 ; mm0=tmp2 - pfadd mm6,mm5 ; mm6=tmp0 - pfadd mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; tmp3 - movq MMWORD [wk(0)], mm0 ; tmp2 - - ; -- Odd part - - movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movq mm4,mm2 - movq mm0,mm5 - pfadd mm2,mm1 ; mm2=z11 - pfadd mm5,mm3 ; mm5=z13 - pfsub mm4,mm1 ; mm4=z12 - pfsub mm0,mm3 ; mm0=z10 - - movq mm1,mm2 - pfsub mm2,mm5 - pfadd mm1,mm5 ; mm1=tmp7 - - pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 - - movq mm3,mm0 - pfadd mm0,mm4 - pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 - pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) - pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) - pfsubr mm3,mm0 ; mm3=tmp12 - pfsub mm4,mm0 ; mm4=tmp10 - - ; -- Final output stage - - pfsub mm3,mm1 ; mm3=tmp6 - movq mm5,mm6 - movq mm0,mm7 - pfadd mm6,mm1 ; mm6=data0=(00 10) - pfadd mm7,mm3 ; mm7=data1=(01 11) - pfsub mm5,mm1 ; mm5=data7=(07 17) - pfsub mm0,mm3 ; mm0=data6=(06 16) - pfsub mm2,mm3 ; mm2=tmp5 - - movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] - pcmpeqd mm3,mm3 - psrld mm3,WORD_BIT ; mm3={0xFFFF 
0x0000 0xFFFF 0x0000} - - pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) - pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) - pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) - pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) - - pand mm6,mm3 ; mm6=(00 -- 10 --) - pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11) - pand mm0,mm3 ; mm0=(06 -- 16 --) - pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17) - por mm6,mm7 ; mm6=(00 01 10 11) - por mm0,mm5 ; mm0=(06 07 16 17) - - movq mm1, MMWORD [wk(0)] ; mm1=tmp2 - movq mm3, MMWORD [wk(1)] ; mm3=tmp3 - - pfadd mm4,mm2 ; mm4=tmp4 - movq mm7,mm1 - movq mm5,mm3 - pfadd mm1,mm2 ; mm1=data2=(02 12) - pfadd mm3,mm4 ; mm3=data4=(04 14) - pfsub mm7,mm2 ; mm7=data5=(05 15) - pfsub mm5,mm4 ; mm5=data3=(03 13) - - movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] - pcmpeqd mm4,mm4 - psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} - - pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) - pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) - pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) - pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) - - pand mm3,mm4 ; mm3=(04 -- 14 --) - pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15) - pand mm1,mm4 ; mm1=(02 -- 12 --) - pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13) - por mm3,mm7 ; mm3=(04 05 14 15) - por mm1,mm5 ; mm1=(02 03 12 13) - - movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] - - packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15) - packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17) - paddb mm6,mm2 - paddb mm1,mm2 - - movq mm4,mm6 ; transpose coefficients(phase 2) - punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13) - punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17) - - movq mm7,mm6 ; transpose coefficients(phase 3) - punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07) - punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD 
[edx+eax*SIZEOF_JSAMPLE], mm6 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 - - poppic ebx ; restore GOT address - - add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 2*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - femms ; empty MMX/3DNow! state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jimmxfst.asm b/Builder/jni-1.11/simd/i386/src/jimmxfst.asm deleted file mode 100644 index 3b055727d..000000000 --- a/Builder/jni-1.11/simd/i386/src/jimmxfst.asm +++ /dev/null @@ -1,500 +0,0 @@ -; -; jimmxfst.asm - fast integer IDCT (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a fast, not so accurate integer implementation of -; the inverse DCT (Discrete Cosine Transform). The following code is -; based directly on the IJG's original jidctfst.c; see the jidctfst.c -; for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 - -%if IFAST_SCALE_BITS != PASS1_BITS -%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." 
-%endif - -%if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. -%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - -; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) -; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) - -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - - alignz 16 - global EXTN(jconst_idct_ifast_mmx) - -EXTN(jconst_idct_ifast_mmx): - -PW_F1414 times 4 dw F_1_414 << CONST_SHIFT -PW_F1847 times 4 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 4 dw F_1_082 << CONST_SHIFT -PB_CENTERJSAMP times 8 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. 
-; -; GLOBAL(void) -; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF - ; JCOEF workspace[DCTSIZE2] - - align 16 - global EXTN(jsimd_idct_ifast_mmx) - -EXTN(jsimd_idct_ifast_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; JCOEF * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.columnloop: -%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm1,mm0 - packsswb mm1,mm1 - movd eax,mm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movq mm2,mm0 ; mm0=in0=(00 01 02 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm2,mm2 ; mm2=(02 02 03 03) - - movq mm1,mm0 - punpckldq mm0,mm0 ; mm0=(00 00 00 00) - punpckhdq mm1,mm1 ; mm1=(01 01 01 01) - movq mm3,mm2 - punpckldq mm2,mm2 ; mm2=(02 02 02 02) - punpckhdq mm3,mm3 ; mm3=(03 03 03 03) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 - jmp near .nextcolumn - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movq mm2, 
MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movq mm4,mm0 - movq mm5,mm1 - psubw mm0,mm2 ; mm0=tmp11 - psubw mm1,mm3 - paddw mm4,mm2 ; mm4=tmp10 - paddw mm5,mm3 ; mm5=tmp13 - - psllw mm1,PRE_MULTIPLY_SCALE_BITS - pmulhw mm1,[GOTOFF(ebx,PW_F1414)] - psubw mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - psubw mm4,mm5 ; mm4=tmp3 - psubw mm0,mm1 ; mm0=tmp2 - paddw mm6,mm5 ; mm6=tmp0 - paddw mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 - movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 - - ; -- Odd part - - movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movq mm4,mm2 - movq mm0,mm5 - psubw mm2,mm1 ; mm2=z12 - psubw mm5,mm3 ; mm5=z10 - paddw mm4,mm1 ; mm4=z11 - paddw mm0,mm3 ; mm0=z13 - - movq mm1,mm5 ; mm1=z10(unscaled) - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm5,PRE_MULTIPLY_SCALE_BITS - - movq mm3,mm4 - psubw mm4,mm0 - paddw mm3,mm0 ; mm3=tmp7 - - psllw mm4,PRE_MULTIPLY_SCALE_BITS - pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movq mm0,mm5 - paddw mm5,mm2 - pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 - pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw mm2,[GOTOFF(ebx,PW_F1082)] - psubw mm0,mm1 - psubw mm2,mm5 ; mm2=tmp10 - paddw mm0,mm5 ; mm0=tmp12 - - ; -- Final output stage - - psubw mm0,mm3 ; mm0=tmp6 - movq mm1,mm6 - movq mm5,mm7 - paddw mm6,mm3 ; mm6=data0=(00 01 02 03) - paddw mm7,mm0 ; mm7=data1=(10 11 12 13) - psubw mm1,mm3 ; mm1=data7=(70 71 72 73) - psubw mm5,mm0 ; mm5=data6=(60 61 62 63) - psubw mm4,mm0 ; mm4=tmp5 - - movq mm3,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 10 01 11) - punpckhwd mm3,mm7 ; mm3=(02 12 03 13) - movq mm0,mm5 ; transpose coefficients(phase 1) - punpcklwd mm5,mm1 ; mm5=(60 70 61 71) - punpckhwd mm0,mm1 ; mm0=(62 72 63 73) - - movq mm7, MMWORD [wk(0)] ; mm7=tmp2 - movq mm1, MMWORD [wk(1)] ; mm1=tmp3 - - movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) - movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) - - paddw mm2,mm4 ; mm2=tmp4 - movq mm5,mm7 - movq mm0,mm1 - paddw mm7,mm4 ; mm7=data2=(20 21 22 23) - paddw mm1,mm2 ; mm1=data4=(40 41 42 43) - psubw mm5,mm4 ; mm5=data5=(50 51 52 53) - psubw mm0,mm2 ; mm0=data3=(30 31 32 33) - - movq mm4,mm7 ; transpose coefficients(phase 1) - punpcklwd mm7,mm0 ; mm7=(20 30 21 31) - punpckhwd mm4,mm0 ; mm4=(22 32 23 33) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm5 ; mm1=(40 50 41 51) - punpckhwd mm2,mm5 ; mm2=(42 52 43 53) - - movq mm0,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm7 ; mm6=(00 10 20 30) - punpckhdq mm0,mm7 ; mm0=(01 11 21 31) - movq mm5,mm3 ; transpose coefficients(phase 2) - punpckldq mm3,mm4 ; mm3=(02 12 22 32) - punpckhdq mm5,mm4 ; mm5=(03 13 23 33) - - movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) - movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) - - movq MMWORD 
[MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 - - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm7 ; mm1=(40 50 60 70) - punpckhdq mm6,mm7 ; mm6=(41 51 61 71) - movq mm0,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm4 ; mm2=(42 52 62 72) - punpckhdq mm0,mm4 ; mm0=(43 53 63 73) - - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 - -.nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr - add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; JCOEF * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.rowloop: - - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - movq mm4,mm0 - movq mm5,mm1 - psubw mm0,mm2 ; mm0=tmp11 - psubw mm1,mm3 - paddw mm4,mm2 ; mm4=tmp10 - paddw mm5,mm3 ; mm5=tmp13 - - psllw mm1,PRE_MULTIPLY_SCALE_BITS - pmulhw mm1,[GOTOFF(ebx,PW_F1414)] - psubw mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - psubw mm4,mm5 ; mm4=tmp3 - psubw mm0,mm1 ; mm0=tmp2 - paddw mm6,mm5 ; mm6=tmp0 - paddw mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 - movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 - - ; -- Odd part - - movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - 
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - movq mm4,mm2 - movq mm0,mm5 - psubw mm2,mm1 ; mm2=z12 - psubw mm5,mm3 ; mm5=z10 - paddw mm4,mm1 ; mm4=z11 - paddw mm0,mm3 ; mm0=z13 - - movq mm1,mm5 ; mm1=z10(unscaled) - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm5,PRE_MULTIPLY_SCALE_BITS - - movq mm3,mm4 - psubw mm4,mm0 - paddw mm3,mm0 ; mm3=tmp7 - - psllw mm4,PRE_MULTIPLY_SCALE_BITS - pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 - - ; To avoid overflow... - ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movq mm0,mm5 - paddw mm5,mm2 - pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 - pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw mm2,[GOTOFF(ebx,PW_F1082)] - psubw mm0,mm1 - psubw mm2,mm5 ; mm2=tmp10 - paddw mm0,mm5 ; mm0=tmp12 - - ; -- Final output stage - - psubw mm0,mm3 ; mm0=tmp6 - movq mm1,mm6 - movq mm5,mm7 - paddw mm6,mm3 ; mm6=data0=(00 10 20 30) - paddw mm7,mm0 ; mm7=data1=(01 11 21 31) - psraw mm6,(PASS1_BITS+3) ; descale - psraw mm7,(PASS1_BITS+3) ; descale - psubw mm1,mm3 ; mm1=data7=(07 17 27 37) - psubw mm5,mm0 ; mm5=data6=(06 16 26 36) - psraw mm1,(PASS1_BITS+3) ; descale - psraw mm5,(PASS1_BITS+3) ; descale - psubw mm4,mm0 ; mm4=tmp5 - - packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36) - packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37) - - movq mm3, MMWORD [wk(0)] ; mm3=tmp2 - movq mm0, MMWORD [wk(1)] ; mm0=tmp3 - - paddw mm2,mm4 ; mm2=tmp4 - movq mm5,mm3 - movq mm1,mm0 - paddw mm3,mm4 ; mm3=data2=(02 12 22 32) - paddw mm0,mm2 ; mm0=data4=(04 14 24 34) - psraw mm3,(PASS1_BITS+3) ; descale - psraw mm0,(PASS1_BITS+3) ; descale - psubw mm5,mm4 ; mm5=data5=(05 15 25 35) - psubw mm1,mm2 ; mm1=data3=(03 13 23 33) - psraw mm5,(PASS1_BITS+3) ; descale - psraw mm1,(PASS1_BITS+3) ; descale - - movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] - - packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34) - packsswb mm1,mm5 ; mm1=(03 13 23 33 
05 15 25 35) - - paddb mm6,mm4 - paddb mm7,mm4 - paddb mm3,mm4 - paddb mm1,mm4 - - movq mm2,mm6 ; transpose coefficients(phase 1) - punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31) - punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37) - movq mm0,mm3 ; transpose coefficients(phase 1) - punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33) - punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35) - - movq mm5,mm6 ; transpose coefficients(phase 2) - punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13) - punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33) - movq mm4,mm0 ; transpose coefficients(phase 2) - punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17) - punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37) - - movq mm7,mm6 ; transpose coefficients(phase 3) - punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07) - punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17) - movq mm1,mm5 ; transpose coefficients(phase 3) - punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27) - punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_JCOEF ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jimmxint.asm b/Builder/jni-1.11/simd/i386/src/jimmxint.asm deleted file mode 100644 index 7b52fae34..000000000 --- a/Builder/jni-1.11/simd/i386/src/jimmxint.asm +++ /dev/null @@ -1,852 +0,0 @@ -; -; jimmxint.asm - accurate integer IDCT (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a slow-but-accurate integer implementation of the -; inverse DCT (Discrete Cosine Transform). The following code is based -; directly on the IJG's original jidctint.c; see the jidctint.c for -; more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 13 -%define PASS1_BITS 2 - -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) - -%if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_islow_mmx) - -EXTN(jconst_idct_islow_mmx): - -PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 8 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. 
-; -; GLOBAL(void) -; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 12 -%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF - ; JCOEF workspace[DCTSIZE2] - - align 16 - global EXTN(jsimd_idct_islow_mmx) - -EXTN(jsimd_idct_islow_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; JCOEF * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.columnloop: -%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm1,mm0 - packsswb mm1,mm1 - movd eax,mm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw mm0,PASS1_BITS - - movq mm2,mm0 ; mm0=in0=(00 01 02 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm2,mm2 ; mm2=(02 02 03 03) - - movq mm1,mm0 - punpckldq mm0,mm0 ; mm0=(00 00 00 00) - punpckhdq mm1,mm1 ; mm1=(01 01 01 01) - movq mm3,mm2 - punpckldq mm2,mm2 ; mm2=(02 02 02 02) - punpckhdq mm3,mm3 ; mm3=(03 03 03 03) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 - jmp near .nextcolumn - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD 
[MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movq mm4,mm1 ; mm1=in2=z2 - movq mm5,mm1 - punpcklwd mm4,mm3 ; mm3=in6=z3 - punpckhwd mm5,mm3 - movq mm1,mm4 - movq mm3,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L - pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H - - movq mm6,mm0 - paddw mm0,mm2 ; mm0=in0+in4 - psubw mm6,mm2 ; mm6=in0-in4 - - pxor mm7,mm7 - pxor mm2,mm2 - punpcklwd mm7,mm0 ; mm7=tmp0L - punpckhwd mm2,mm0 ; mm2=tmp0H - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS - - movq mm0,mm7 - paddd mm7,mm4 ; mm7=tmp10L - psubd mm0,mm4 ; mm0=tmp13L - movq mm4,mm2 - paddd mm2,mm5 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp13H - - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L - movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H - movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L - movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H - - pxor mm5,mm5 - pxor mm7,mm7 - punpcklwd mm5,mm6 ; mm5=tmp1L - punpckhwd mm7,mm6 ; mm7=tmp1H - psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - - movq mm2,mm5 - paddd mm5,mm1 ; mm5=tmp11L - psubd mm2,mm1 ; mm2=tmp12L - movq mm0,mm7 - paddd mm7,mm3 ; mm7=tmp11H - psubd mm0,mm3 ; mm0=tmp12H - - movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L - movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H - movq MMWORD [wk(6)], 
mm2 ; wk(6)=tmp12L - movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H - - ; -- Odd part - - movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movq mm5,mm6 - movq mm7,mm4 - paddw mm5,mm3 ; mm5=z3 - paddw mm7,mm1 ; mm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm2,mm5 - movq mm0,mm5 - punpcklwd mm2,mm7 - punpckhwd mm0,mm7 - movq mm5,mm2 - movq mm7,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L - pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H - pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L - pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H - - movq MMWORD [wk(10)], mm2 ; wk(10)=z3L - movq MMWORD [wk(11)], mm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movq mm2,mm3 - movq mm0,mm3 - punpcklwd 
mm2,mm4 - punpckhwd mm0,mm4 - movq mm3,mm2 - movq mm4,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L - pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H - pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L - pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H - - paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L - paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H - paddd mm3,mm5 ; mm3=tmp3L - paddd mm4,mm7 ; mm4=tmp3H - - movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L - movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H - - movq mm2,mm1 - movq mm0,mm1 - punpcklwd mm2,mm6 - punpckhwd mm0,mm6 - movq mm1,mm2 - movq mm6,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L - pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H - pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L - pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H - - paddd mm2,mm5 ; mm2=tmp1L - paddd mm0,mm7 ; mm0=tmp1H - paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L - paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H - - movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L - movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movq mm5, MMWORD [wk(0)] ; mm5=tmp10L - movq mm7, MMWORD [wk(1)] ; mm7=tmp10H - - movq mm2,mm5 - movq mm0,mm7 - paddd mm5,mm3 ; mm5=data0L - paddd mm7,mm4 ; mm7=data0H - psubd mm2,mm3 ; mm2=data7L - psubd mm0,mm4 ; mm0=data7H - - movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] - - paddd mm5,mm3 - paddd mm7,mm3 - psrad mm5,DESCALE_P1 - psrad mm7,DESCALE_P1 - paddd mm2,mm3 - paddd mm0,mm3 - psrad mm2,DESCALE_P1 - psrad mm0,DESCALE_P1 - - packssdw mm5,mm7 ; mm5=data0=(00 01 02 03) - packssdw mm2,mm0 ; mm2=data7=(70 71 72 73) - - movq mm4, MMWORD [wk(4)] ; mm4=tmp11L - movq mm3, MMWORD [wk(5)] ; mm3=tmp11H - - movq mm7,mm4 - movq mm0,mm3 - paddd mm4,mm1 ; mm4=data1L - paddd mm3,mm6 ; mm3=data1H - psubd mm7,mm1 ; mm7=data6L - psubd mm0,mm6 ; mm0=data6H - - movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1] - - paddd mm4,mm1 - paddd mm3,mm1 - psrad mm4,DESCALE_P1 - psrad mm3,DESCALE_P1 - paddd 
mm7,mm1 - paddd mm0,mm1 - psrad mm7,DESCALE_P1 - psrad mm0,DESCALE_P1 - - packssdw mm4,mm3 ; mm4=data1=(10 11 12 13) - packssdw mm7,mm0 ; mm7=data6=(60 61 62 63) - - movq mm6,mm5 ; transpose coefficients(phase 1) - punpcklwd mm5,mm4 ; mm5=(00 10 01 11) - punpckhwd mm6,mm4 ; mm6=(02 12 03 13) - movq mm1,mm7 ; transpose coefficients(phase 1) - punpcklwd mm7,mm2 ; mm7=(60 70 61 71) - punpckhwd mm1,mm2 ; mm1=(62 72 63 73) - - movq mm3, MMWORD [wk(6)] ; mm3=tmp12L - movq mm0, MMWORD [wk(7)] ; mm0=tmp12H - movq mm4, MMWORD [wk(10)] ; mm4=tmp1L - movq mm2, MMWORD [wk(11)] ; mm2=tmp1H - - movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) - movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) - movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) - movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) - - movq mm5,mm3 - movq mm6,mm0 - paddd mm3,mm4 ; mm3=data2L - paddd mm0,mm2 ; mm0=data2H - psubd mm5,mm4 ; mm5=data5L - psubd mm6,mm2 ; mm6=data5H - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] - - paddd mm3,mm7 - paddd mm0,mm7 - psrad mm3,DESCALE_P1 - psrad mm0,DESCALE_P1 - paddd mm5,mm7 - paddd mm6,mm7 - psrad mm5,DESCALE_P1 - psrad mm6,DESCALE_P1 - - packssdw mm3,mm0 ; mm3=data2=(20 21 22 23) - packssdw mm5,mm6 ; mm5=data5=(50 51 52 53) - - movq mm1, MMWORD [wk(2)] ; mm1=tmp13L - movq mm4, MMWORD [wk(3)] ; mm4=tmp13H - movq mm2, MMWORD [wk(8)] ; mm2=tmp0L - movq mm7, MMWORD [wk(9)] ; mm7=tmp0H - - movq mm0,mm1 - movq mm6,mm4 - paddd mm1,mm2 ; mm1=data3L - paddd mm4,mm7 ; mm4=data3H - psubd mm0,mm2 ; mm0=data4L - psubd mm6,mm7 ; mm6=data4H - - movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] - - paddd mm1,mm2 - paddd mm4,mm2 - psrad mm1,DESCALE_P1 - psrad mm4,DESCALE_P1 - paddd mm0,mm2 - paddd mm6,mm2 - psrad mm0,DESCALE_P1 - psrad mm6,DESCALE_P1 - - packssdw mm1,mm4 ; mm1=data3=(30 31 32 33) - packssdw mm0,mm6 ; mm0=data4=(40 41 42 43) - - movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) - movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13) - - movq mm4,mm3 ; transpose 
coefficients(phase 1) - punpcklwd mm3,mm1 ; mm3=(20 30 21 31) - punpckhwd mm4,mm1 ; mm4=(22 32 23 33) - movq mm6,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm5 ; mm0=(40 50 41 51) - punpckhwd mm6,mm5 ; mm6=(42 52 43 53) - - movq mm1,mm7 ; transpose coefficients(phase 2) - punpckldq mm7,mm3 ; mm7=(00 10 20 30) - punpckhdq mm1,mm3 ; mm1=(01 11 21 31) - movq mm5,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm4 ; mm2=(02 12 22 32) - punpckhdq mm5,mm4 ; mm5=(03 13 23 33) - - movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) - movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 - - movq mm7,mm0 ; transpose coefficients(phase 2) - punpckldq mm0,mm3 ; mm0=(40 50 60 70) - punpckhdq mm7,mm3 ; mm7=(41 51 61 71) - movq mm1,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm4 ; mm6=(42 52 62 72) - punpckhdq mm1,mm4 ; mm1=(43 53 63 73) - - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 - -.nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr - add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - lea esi, [workspace] ; JCOEF * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.rowloop: - - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movq mm4,mm1 ; mm1=in2=z2 - movq mm5,mm1 - punpcklwd mm4,mm3 ; mm3=in6=z3 - punpckhwd mm5,mm3 - movq mm1,mm4 - movq mm3,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L - pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H - - movq mm6,mm0 - paddw mm0,mm2 ; mm0=in0+in4 - psubw mm6,mm2 ; mm6=in0-in4 - - pxor mm7,mm7 - pxor mm2,mm2 - punpcklwd mm7,mm0 ; mm7=tmp0L - punpckhwd mm2,mm0 ; mm2=tmp0H - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS - - movq mm0,mm7 - paddd mm7,mm4 ; mm7=tmp10L - psubd mm0,mm4 ; mm0=tmp13L - movq mm4,mm2 - paddd mm2,mm5 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp13H - - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L - movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H - movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L - movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H - - pxor mm5,mm5 - pxor mm7,mm7 - punpcklwd mm5,mm6 ; mm5=tmp1L - punpckhwd mm7,mm6 ; mm7=tmp1H - psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - - movq mm2,mm5 - paddd mm5,mm1 ; mm5=tmp11L - psubd mm2,mm1 ; mm2=tmp12L - movq mm0,mm7 
- paddd mm7,mm3 ; mm7=tmp11H - psubd mm0,mm3 ; mm0=tmp12H - - movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L - movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H - movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L - movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H - - ; -- Odd part - - movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - movq mm5,mm6 - movq mm7,mm4 - paddw mm5,mm3 ; mm5=z3 - paddw mm7,mm1 ; mm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm2,mm5 - movq mm0,mm5 - punpcklwd mm2,mm7 - punpckhwd mm0,mm7 - movq mm5,mm2 - movq mm7,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L - pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H - pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L - pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H - - movq MMWORD [wk(10)], mm2 ; wk(10)=z3L - movq MMWORD [wk(11)], mm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movq mm2,mm3 - movq mm0,mm3 - punpcklwd mm2,mm4 - punpckhwd mm0,mm4 - movq mm3,mm2 - movq mm4,mm0 - pmaddwd 
mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L - pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H - pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L - pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H - - paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L - paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H - paddd mm3,mm5 ; mm3=tmp3L - paddd mm4,mm7 ; mm4=tmp3H - - movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L - movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H - - movq mm2,mm1 - movq mm0,mm1 - punpcklwd mm2,mm6 - punpckhwd mm0,mm6 - movq mm1,mm2 - movq mm6,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L - pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H - pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L - pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H - - paddd mm2,mm5 ; mm2=tmp1L - paddd mm0,mm7 ; mm0=tmp1H - paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L - paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H - - movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L - movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movq mm5, MMWORD [wk(0)] ; mm5=tmp10L - movq mm7, MMWORD [wk(1)] ; mm7=tmp10H - - movq mm2,mm5 - movq mm0,mm7 - paddd mm5,mm3 ; mm5=data0L - paddd mm7,mm4 ; mm7=data0H - psubd mm2,mm3 ; mm2=data7L - psubd mm0,mm4 ; mm0=data7H - - movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] - - paddd mm5,mm3 - paddd mm7,mm3 - psrad mm5,DESCALE_P2 - psrad mm7,DESCALE_P2 - paddd mm2,mm3 - paddd mm0,mm3 - psrad mm2,DESCALE_P2 - psrad mm0,DESCALE_P2 - - packssdw mm5,mm7 ; mm5=data0=(00 10 20 30) - packssdw mm2,mm0 ; mm2=data7=(07 17 27 37) - - movq mm4, MMWORD [wk(4)] ; mm4=tmp11L - movq mm3, MMWORD [wk(5)] ; mm3=tmp11H - - movq mm7,mm4 - movq mm0,mm3 - paddd mm4,mm1 ; mm4=data1L - paddd mm3,mm6 ; mm3=data1H - psubd mm7,mm1 ; mm7=data6L - psubd mm0,mm6 ; mm0=data6H - - movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] - - paddd mm4,mm1 - paddd mm3,mm1 - psrad mm4,DESCALE_P2 - psrad mm3,DESCALE_P2 - paddd mm7,mm1 - paddd mm0,mm1 - psrad mm7,DESCALE_P2 - psrad 
mm0,DESCALE_P2 - - packssdw mm4,mm3 ; mm4=data1=(01 11 21 31) - packssdw mm7,mm0 ; mm7=data6=(06 16 26 36) - - packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36) - packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37) - - movq mm6, MMWORD [wk(6)] ; mm6=tmp12L - movq mm1, MMWORD [wk(7)] ; mm1=tmp12H - movq mm3, MMWORD [wk(10)] ; mm3=tmp1L - movq mm0, MMWORD [wk(11)] ; mm0=tmp1H - - movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) - movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) - - movq mm7,mm6 - movq mm2,mm1 - paddd mm6,mm3 ; mm6=data2L - paddd mm1,mm0 ; mm1=data2H - psubd mm7,mm3 ; mm7=data5L - psubd mm2,mm0 ; mm2=data5H - - movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] - - paddd mm6,mm5 - paddd mm1,mm5 - psrad mm6,DESCALE_P2 - psrad mm1,DESCALE_P2 - paddd mm7,mm5 - paddd mm2,mm5 - psrad mm7,DESCALE_P2 - psrad mm2,DESCALE_P2 - - packssdw mm6,mm1 ; mm6=data2=(02 12 22 32) - packssdw mm7,mm2 ; mm7=data5=(05 15 25 35) - - movq mm4, MMWORD [wk(2)] ; mm4=tmp13L - movq mm3, MMWORD [wk(3)] ; mm3=tmp13H - movq mm0, MMWORD [wk(8)] ; mm0=tmp0L - movq mm5, MMWORD [wk(9)] ; mm5=tmp0H - - movq mm1,mm4 - movq mm2,mm3 - paddd mm4,mm0 ; mm4=data3L - paddd mm3,mm5 ; mm3=data3H - psubd mm1,mm0 ; mm1=data4L - psubd mm2,mm5 ; mm2=data4H - - movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] - - paddd mm4,mm0 - paddd mm3,mm0 - psrad mm4,DESCALE_P2 - psrad mm3,DESCALE_P2 - paddd mm1,mm0 - paddd mm2,mm0 - psrad mm1,DESCALE_P2 - psrad mm2,DESCALE_P2 - - movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] - - packssdw mm4,mm3 ; mm4=data3=(03 13 23 33) - packssdw mm1,mm2 ; mm1=data4=(04 14 24 34) - - movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) - movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) - - packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34) - packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35) - - paddb mm0,mm5 - paddb mm3,mm5 - paddb mm6,mm5 - paddb mm4,mm5 - - movq mm2,mm0 ; transpose coefficients(phase 1) - punpcklbw 
mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31) - punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37) - movq mm1,mm6 ; transpose coefficients(phase 1) - punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33) - punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35) - - movq mm7,mm0 ; transpose coefficients(phase 2) - punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13) - punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33) - movq mm5,mm1 ; transpose coefficients(phase 2) - punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17) - punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37) - - movq mm3,mm0 ; transpose coefficients(phase 3) - punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07) - punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17) - movq mm4,mm7 ; transpose coefficients(phase 3) - punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27) - punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_JCOEF ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jimmxred.asm b/Builder/jni-1.11/simd/i386/src/jimmxred.asm deleted file mode 100644 index a2b7103df..000000000 --- a/Builder/jni-1.11/simd/i386/src/jimmxred.asm +++ /dev/null @@ -1,706 +0,0 @@ -; -; jimmxred.asm - reduced-size IDCT (MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains inverse-DCT routines that produce reduced-size -; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. -; The following code is based directly on the IJG's original jidctred.c; -; see the jidctred.c for more details. 
-; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 13 -%define PASS1_BITS 2 - -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) - -%if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_red_mmx) - -EXTN(jconst_idct_red_mmx): - -PW_F184_MF076 times 2 dw F_1_847,-F_0_765 -PW_F256_F089 times 2 dw F_2_562, F_0_899 -PW_F106_MF217 times 2 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 2 dw F_1_451,-F_0_211 -PW_F362_MF127 times 2 dw F_3_624,-F_1_272 -PW_F085_MF072 times 2 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 8 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients, -; producing a reduced-size 4x4 output block. 
-; -; GLOBAL(void) -; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF - ; JCOEF workspace[DCTSIZE2] - - align 16 - global EXTN(jsimd_idct_4x4_mmx) - -EXTN(jsimd_idct_4x4_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; JCOEF * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.columnloop: -%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm0,mm1 - packsswb mm0,mm0 - movd eax,mm0 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw mm0,PASS1_BITS - - movq mm2,mm0 ; mm0=in0=(00 01 02 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm2,mm2 ; mm2=(02 02 03 03) - - movq mm1,mm0 - punpckldq mm0,mm0 ; mm0=(00 00 00 00) - punpckhdq mm1,mm1 ; mm1=(01 01 01 01) - movq mm3,mm2 - punpckldq mm2,mm2 ; mm2=(02 02 02 02) - punpckhdq mm3,mm3 ; mm3=(03 03 03 03) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - jmp near .nextcolumn - alignx 16,7 -%endif -.columnDCT: - - ; -- Odd part - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movq 
mm4,mm0 - movq mm5,mm0 - punpcklwd mm4,mm1 - punpckhwd mm5,mm1 - movq mm0,mm4 - movq mm1,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) - pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) - pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) - pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) - - movq mm6,mm2 - movq mm7,mm2 - punpcklwd mm6,mm3 - punpckhwd mm7,mm3 - movq mm2,mm6 - movq mm3,mm7 - pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) - pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) - pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) - pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) - - paddd mm6,mm4 ; mm6=tmp2L - paddd mm7,mm5 ; mm7=tmp2H - paddd mm2,mm0 ; mm2=tmp0L - paddd mm3,mm1 ; mm3=tmp0H - - movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L - movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H - - ; -- Even part - - movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor mm1,mm1 - pxor mm2,mm2 - punpcklwd mm1,mm4 ; mm1=tmp0L - punpckhwd mm2,mm4 ; mm2=tmp0H - psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 - psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 - - movq mm3,mm5 ; mm5=in2=z2 - punpcklwd mm5,mm0 ; mm0=in6=z3 - punpckhwd mm3,mm0 - pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H - - movq mm4,mm1 - movq mm0,mm2 - paddd mm1,mm5 ; mm1=tmp10L - paddd mm2,mm3 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp12L - psubd mm0,mm3 ; mm0=tmp12H - - ; -- Final output stage - - movq mm5,mm1 - movq mm3,mm2 - paddd mm1,mm6 ; mm1=data0L - paddd mm2,mm7 ; mm2=data0H - psubd mm5,mm6 ; mm5=data3L - psubd mm3,mm7 ; mm3=data3H - - movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; 
mm6=[PD_DESCALE_P1_4] - - paddd mm1,mm6 - paddd mm2,mm6 - psrad mm1,DESCALE_P1_4 - psrad mm2,DESCALE_P1_4 - paddd mm5,mm6 - paddd mm3,mm6 - psrad mm5,DESCALE_P1_4 - psrad mm3,DESCALE_P1_4 - - packssdw mm1,mm2 ; mm1=data0=(00 01 02 03) - packssdw mm5,mm3 ; mm5=data3=(30 31 32 33) - - movq mm7, MMWORD [wk(0)] ; mm7=tmp0L - movq mm6, MMWORD [wk(1)] ; mm6=tmp0H - - movq mm2,mm4 - movq mm3,mm0 - paddd mm4,mm7 ; mm4=data1L - paddd mm0,mm6 ; mm0=data1H - psubd mm2,mm7 ; mm2=data2L - psubd mm3,mm6 ; mm3=data2H - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] - - paddd mm4,mm7 - paddd mm0,mm7 - psrad mm4,DESCALE_P1_4 - psrad mm0,DESCALE_P1_4 - paddd mm2,mm7 - paddd mm3,mm7 - psrad mm2,DESCALE_P1_4 - psrad mm3,DESCALE_P1_4 - - packssdw mm4,mm0 ; mm4=data1=(10 11 12 13) - packssdw mm2,mm3 ; mm2=data2=(20 21 22 23) - - movq mm6,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm4 ; mm1=(00 10 01 11) - punpckhwd mm6,mm4 ; mm6=(02 12 03 13) - movq mm7,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm5 ; mm2=(20 30 21 31) - punpckhwd mm7,mm5 ; mm7=(22 32 23 33) - - movq mm0,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm2 ; mm1=(00 10 20 30) - punpckhdq mm0,mm2 ; mm0=(01 11 21 31) - movq mm3,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm7 ; mm6=(02 12 22 32) - punpckhdq mm3,mm7 ; mm3=(03 13 23 33) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - -.nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr - add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - lea esi, [workspace] ; JCOEF * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Odd part - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - movq mm4,mm0 - movq mm5,mm0 - punpcklwd mm4,mm1 - punpckhwd mm5,mm1 - movq mm0,mm4 - movq mm1,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) - pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) - pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) - pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) - - movq mm6,mm2 - movq mm7,mm2 - punpcklwd mm6,mm3 - punpckhwd mm7,mm3 - movq mm2,mm6 - movq mm3,mm7 - pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) - pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) - pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) - pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) - - paddd mm6,mm4 ; mm6=tmp2L - paddd mm7,mm5 ; mm7=tmp2H - paddd mm2,mm0 ; mm2=tmp0L - paddd mm3,mm1 ; mm3=tmp0H - - movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L - movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H - - ; -- Even part - - movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - pxor mm1,mm1 - pxor mm2,mm2 - punpcklwd mm1,mm4 ; mm1=tmp0L - punpckhwd mm2,mm4 ; mm2=tmp0H - psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 - psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 - - movq mm3,mm5 ; mm5=in2=z2 - punpcklwd mm5,mm0 ; mm0=in6=z3 - punpckhwd mm3,mm0 - pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H - - movq mm4,mm1 - movq mm0,mm2 - paddd mm1,mm5 ; mm1=tmp10L - paddd mm2,mm3 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp12L - psubd mm0,mm3 ; mm0=tmp12H - - ; -- Final 
output stage - - movq mm5,mm1 - movq mm3,mm2 - paddd mm1,mm6 ; mm1=data0L - paddd mm2,mm7 ; mm2=data0H - psubd mm5,mm6 ; mm5=data3L - psubd mm3,mm7 ; mm3=data3H - - movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] - - paddd mm1,mm6 - paddd mm2,mm6 - psrad mm1,DESCALE_P2_4 - psrad mm2,DESCALE_P2_4 - paddd mm5,mm6 - paddd mm3,mm6 - psrad mm5,DESCALE_P2_4 - psrad mm3,DESCALE_P2_4 - - packssdw mm1,mm2 ; mm1=data0=(00 10 20 30) - packssdw mm5,mm3 ; mm5=data3=(03 13 23 33) - - movq mm7, MMWORD [wk(0)] ; mm7=tmp0L - movq mm6, MMWORD [wk(1)] ; mm6=tmp0H - - movq mm2,mm4 - movq mm3,mm0 - paddd mm4,mm7 ; mm4=data1L - paddd mm0,mm6 ; mm0=data1H - psubd mm2,mm7 ; mm2=data2L - psubd mm3,mm6 ; mm3=data2H - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] - - paddd mm4,mm7 - paddd mm0,mm7 - psrad mm4,DESCALE_P2_4 - psrad mm0,DESCALE_P2_4 - paddd mm2,mm7 - paddd mm3,mm7 - psrad mm2,DESCALE_P2_4 - psrad mm3,DESCALE_P2_4 - - packssdw mm4,mm0 ; mm4=data1=(01 11 21 31) - packssdw mm2,mm3 ; mm2=data2=(02 12 22 32) - - movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] - - packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32) - packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33) - paddb mm1,mm6 - paddb mm4,mm6 - - movq mm7,mm1 ; transpose coefficients(phase 1) - punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31) - punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33) - - movq mm0,mm1 ; transpose coefficients(phase 2) - punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13) - punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 - movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 - - psrlq mm1,4*BYTE_BIT - psrlq mm0,4*BYTE_BIT - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 - movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 - - emms ; empty MMX state - - pop edi - pop esi 
-; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - - -; -------------------------------------------------------------------------- -; -; Perform dequantization and inverse DCT on one block of coefficients, -; producing a reduced-size 2x2 output block. -; -; GLOBAL(void) -; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - - align 16 - global EXTN(jsimd_idct_2x2_mmx) - -EXTN(jsimd_idct_2x2_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - - mov edx, POINTER [dct_table(ebp)] ; quantptr - mov esi, JCOEFPTR [coef_block(ebp)] ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) - ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) - - pcmpeqd 
mm7,mm7 - pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} - - movq mm4,mm0 ; mm4=(10 11 ** 13) - movq mm5,mm2 ; mm5=(50 51 ** 53) - punpcklwd mm4,mm1 ; mm4=(10 30 11 31) - punpcklwd mm5,mm3 ; mm5=(50 70 51 71) - pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] - - psrld mm0,WORD_BIT ; mm0=(11 -- 13 --) - pand mm1,mm7 ; mm1=(-- 31 -- 33) - psrld mm2,WORD_BIT ; mm2=(51 -- 53 --) - pand mm3,mm7 ; mm3=(-- 71 -- 73) - por mm0,mm1 ; mm0=(11 31 13 33) - por mm2,mm3 ; mm2=(51 71 53 73) - pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)] - - paddd mm4,mm5 ; mm4=tmp0[col0 col1] - - movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] - pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] - pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) - ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) - - psrld mm6,WORD_BIT ; mm6=(15 -- 17 --) - pand mm1,mm7 ; mm1=(-- 35 -- 37) - psrld mm3,WORD_BIT ; mm3=(55 -- 57 --) - pand mm5,mm7 ; mm5=(-- 75 -- 77) - por mm6,mm1 ; mm6=(15 35 17 37) - por mm3,mm5 ; mm3=(55 75 57 77) - pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)] - - paddd mm0,mm2 ; mm0=tmp0[col1 col3] - paddd mm6,mm3 ; mm6=tmp0[col5 col7] - - ; -- Even part - - movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] - pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) - - movq mm2,mm1 ; mm2=(00 01 ** 03) - pslld mm1,WORD_BIT ; mm1=(-- 00 -- **) - psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 
****] - - pand mm2,mm7 ; mm2=(-- 01 -- 03) - pand mm5,mm7 ; mm5=(-- 05 -- 07) - psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] - psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] - - ; -- Final output stage - - movq mm3,mm1 - paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **) - psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **) - punpckldq mm1,mm3 ; mm1=(A0 B0) - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] - - movq mm4,mm2 - movq mm3,mm5 - paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3) - paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7) - psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3) - psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7) - - paddd mm1,mm7 - psrad mm1,DESCALE_P1_2 - - paddd mm2,mm7 - paddd mm5,mm7 - psrad mm2,DESCALE_P1_2 - psrad mm5,DESCALE_P1_2 - paddd mm4,mm7 - paddd mm3,mm7 - psrad mm4,DESCALE_P1_2 - psrad mm3,DESCALE_P1_2 - - ; ---- Pass 2: process rows, store into output array. - - mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(ebp)] - - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | - - ; -- Odd part - - packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3) - packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7) - pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] - - paddd mm2,mm5 ; mm2=tmp0[row0 row1] - - ; -- Even part - - pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1] - - ; -- Final output stage - - movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] - - movq mm6,mm1 - paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1) - psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1) - - paddd mm1,mm0 - paddd mm6,mm0 - psrad mm1,DESCALE_P2_2 - psrad mm6,DESCALE_P2_2 - - movq mm7,mm1 ; transpose coefficients - punpckldq mm1,mm6 ; mm1=(C0 D0) - punpckhdq mm7,mm6 ; mm7=(C1 D1) - - packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1) - packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1) - paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)] - - movd ecx,mm1 - movd ebx,mm1 ; 
ebx=(C0 D0 C1 D1) - shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jiss2flt.asm b/Builder/jni-1.11/simd/i386/src/jiss2flt.asm deleted file mode 100644 index 17bc3633e..000000000 --- a/Builder/jni-1.11/simd/i386/src/jiss2flt.asm +++ /dev/null @@ -1,498 +0,0 @@ -; -; jiss2flt.asm - floating-point IDCT (SSE & SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a floating-point implementation of the inverse DCT -; (Discrete Cosine Transform). The following code is based directly on -; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
-; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 -%endmacro - -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE -%endmacro - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_float_sse2) - -EXTN(jconst_idct_float_sse2): - -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. 
-; -; GLOBAL(void) -; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] - - align 16 - global EXTN(jsimd_idct_float_sse2) - -EXTN(jsimd_idct_float_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.columnloop: -%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm2 - por xmm3,xmm4 - por xmm5,xmm6 - por xmm1,xmm3 - por xmm5,xmm7 - por xmm1,xmm5 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near 
.nextcolumn - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) - - punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) - punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) - cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) - cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) - punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 
32 32 33 33) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) - cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) - cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) - - punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) - punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) - psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) - cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) - cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps 
xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) - - movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - -.nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; 
ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.rowloop: - - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] - pcmpeqd xmm3,xmm3 - psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) - addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) - addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) - addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) - - pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) - pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) - por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) - por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) - - movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 - movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm7,xmm1 - movaps xmm5,xmm3 - addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) - addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) - subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) - subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) - - movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] - pcmpeqd xmm4,xmm4 - psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) - addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) - addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) - addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) - - pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) - 
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) - pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) - por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) - por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) - - movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] - - packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) - packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) - paddb xmm6,xmm2 - paddb xmm1,xmm2 - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 3) - punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - - pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jiss2fst.asm b/Builder/jni-1.11/simd/i386/src/jiss2fst.asm deleted file mode 100644 index b53664d7c..000000000 --- a/Builder/jni-1.11/simd/i386/src/jiss2fst.asm +++ /dev/null @@ -1,502 +0,0 @@ -; -; jiss2fst.asm - fast integer IDCT (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a fast, not so accurate integer implementation of -; the inverse DCT (Discrete Cosine Transform). The following code is -; based directly on the IJG's original jidctfst.c; see the jidctfst.c -; for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 - -%if IFAST_SCALE_BITS != PASS1_BITS -%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." -%endif - -%if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - -; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) -; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) - -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - - alignz 16 - global EXTN(jconst_idct_ifast_sse2) - -EXTN(jconst_idct_ifast_sse2): - -PW_F1414 times 8 dw F_1_414 << CONST_SHIFT -PW_F1847 times 8 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 8 dw F_1_082 << CONST_SHIFT -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. 
-; -; GLOBAL(void) -; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_idct_ifast_sse2) - -EXTN(jsimd_idct_ifast_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - -%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 
06 07 07) - - pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) - pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) - pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) - pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) - pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) - pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) - pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) - pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 - jmp near .column_end - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - psubw xmm0,xmm2 ; xmm0=tmp11 - psubw xmm1,xmm3 - paddw xmm4,xmm2 ; xmm4=tmp10 - paddw xmm5,xmm3 ; xmm5=tmp13 - - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] - psubw xmm1,xmm5 ; xmm1=tmp12 - - movdqa xmm6,xmm4 - movdqa xmm7,xmm0 - psubw xmm4,xmm5 ; xmm4=tmp3 - psubw xmm0,xmm1 ; xmm0=tmp2 - paddw xmm6,xmm5 ; xmm6=tmp0 - paddw xmm7,xmm1 ; xmm7=tmp1 - - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 - - ; -- Odd part - - movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - 
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm2 - movdqa xmm0,xmm5 - psubw xmm2,xmm1 ; xmm2=z12 - psubw xmm5,xmm3 ; xmm5=z10 - paddw xmm4,xmm1 ; xmm4=z11 - paddw xmm0,xmm3 ; xmm0=z13 - - movdqa xmm1,xmm5 ; xmm1=z10(unscaled) - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm3,xmm4 - psubw xmm4,xmm0 - paddw xmm3,xmm0 ; xmm3=tmp7 - - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 - - ; To avoid overflow... - ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm0,xmm5 - paddw xmm5,xmm2 - pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 - pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] - psubw xmm0,xmm1 - psubw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm0,xmm5 ; xmm0=tmp12 - - ; -- Final output stage - - psubw xmm0,xmm3 ; xmm0=tmp6 - movdqa xmm1,xmm6 - movdqa xmm5,xmm7 - paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) - paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) - psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) - psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) - psubw xmm4,xmm0 ; xmm4=tmp5 - - movdqa xmm3,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) - punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) - movdqa xmm0,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) - punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) - - paddw xmm2,xmm4 ; xmm2=tmp4 - movdqa 
xmm5,xmm7 - movdqa xmm0,xmm1 - paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) - paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) - psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) - psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm2,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) - - movdqa xmm0,xmm3 ; transpose coefficients(phase 2) - punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) - punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) - punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) - - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) - punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) - movdqa xmm0,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) - movdqa xmm7,xmm5 ; transpose coefficients(phase 3) - punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) - - movdqa 
XMMWORD [wk(0)], xmm4 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 - - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) - movdqa xmm7,xmm3 ; transpose coefficients(phase 3) - punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) -.column_end: - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 - - movdqa xmm2,xmm6 - movdqa xmm0,xmm5 - psubw xmm6,xmm1 ; xmm6=tmp11 - psubw xmm5,xmm3 - paddw xmm2,xmm1 ; xmm2=tmp10 - paddw xmm0,xmm3 ; xmm0=tmp13 - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] - psubw xmm5,xmm0 ; xmm5=tmp12 - - movdqa xmm1,xmm2 - movdqa xmm3,xmm6 - psubw xmm2,xmm0 ; xmm2=tmp3 - psubw xmm6,xmm5 ; xmm6=tmp2 - paddw xmm1,xmm0 ; xmm1=tmp0 - paddw xmm3,xmm5 ; xmm3=tmp1 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 - - ; -- Odd part - - ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 - - movdqa xmm2,xmm0 - movdqa xmm6,xmm4 - psubw xmm0,xmm7 ; xmm0=z12 - psubw xmm4,xmm5 ; xmm4=z10 - paddw xmm2,xmm7 ; xmm2=z11 - paddw xmm6,xmm5 ; xmm6=z13 - - movdqa xmm7,xmm4 ; xmm7=z10(unscaled) - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm5,xmm2 - psubw xmm2,xmm6 - paddw xmm5,xmm6 ; xmm5=tmp7 - - psllw 
xmm2,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 - - ; To avoid overflow... - ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm6,xmm4 - paddw xmm4,xmm0 - pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 - pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] - pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] - psubw xmm6,xmm7 - psubw xmm0,xmm4 ; xmm0=tmp10 - paddw xmm6,xmm4 ; xmm6=tmp12 - - ; -- Final output stage - - psubw xmm6,xmm5 ; xmm6=tmp6 - movdqa xmm7,xmm1 - movdqa xmm4,xmm3 - paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) - paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) - psraw xmm1,(PASS1_BITS+3) ; descale - psraw xmm3,(PASS1_BITS+3) ; descale - psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) - psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) - psraw xmm7,(PASS1_BITS+3) ; descale - psraw xmm4,(PASS1_BITS+3) ; descale - psubw xmm2,xmm6 ; xmm2=tmp5 - - packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 - - paddw xmm0,xmm2 ; xmm0=tmp4 - movdqa xmm4,xmm5 - movdqa xmm7,xmm6 - paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) - paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) - psraw xmm5,(PASS1_BITS+3) ; descale - psraw xmm6,(PASS1_BITS+3) ; descale - psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) - psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) - psraw xmm4,(PASS1_BITS+3) ; descale - psraw xmm7,(PASS1_BITS+3) ; descale - - movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] - - packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm1,xmm2 - paddb xmm3,xmm2 - paddb xmm5,xmm2 
- paddb xmm7,xmm2 - - movdqa xmm0,xmm1 ; transpose coefficients(phase 1) - punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 2) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm2,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 3) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm7,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 - - mov 
edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 - mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jiss2int.asm b/Builder/jni-1.11/simd/i386/src/jiss2int.asm deleted file mode 100644 index adf39fb3a..000000000 --- a/Builder/jni-1.11/simd/i386/src/jiss2int.asm +++ /dev/null @@ -1,859 +0,0 @@ -; -; jiss2int.asm - accurate integer IDCT (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a slow-but-accurate integer implementation of the -; inverse DCT (Discrete Cosine Transform). The following code is based -; directly on the IJG's original jidctint.c; see the jidctint.c for -; more details. 
-; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 13 -%define PASS1_BITS 2 - -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) - -%if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. -%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_islow_sse2) - -EXTN(jconst_idct_islow_sse2): - -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 4 dw 
(F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. -; -; GLOBAL(void) -; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 12 - - align 16 - global EXTN(jsimd_idct_islow_sse2) - -EXTN(jsimd_idct_islow_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - -%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm5,PASS1_BITS - - movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) - punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) - - pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) - pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) - pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) - pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) - pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) - pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) - pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) - pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 - jmp near .column_end - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw 
xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm4,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm4,xmm3 ; xmm3=in6=z3 - punpckhwd xmm5,xmm3 - movdqa xmm1,xmm4 - movdqa xmm3,xmm5 - pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L - pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L - pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H - - movdqa xmm6,xmm0 - paddw xmm0,xmm2 ; xmm0=in0+in4 - psubw xmm6,xmm2 ; xmm6=in0-in4 - - pxor xmm7,xmm7 - pxor xmm2,xmm2 - punpcklwd xmm7,xmm0 ; xmm7=tmp0L - punpckhwd xmm2,xmm0 ; xmm2=tmp0H - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS - - movdqa xmm0,xmm7 - paddd xmm7,xmm4 ; xmm7=tmp10L - psubd xmm0,xmm4 ; xmm0=tmp13L - movdqa xmm4,xmm2 - paddd xmm2,xmm5 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp13H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm7,xmm7 - punpcklwd xmm5,xmm6 ; xmm5=tmp1L - punpckhwd xmm7,xmm6 ; xmm7=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - - movdqa xmm2,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L 
- psubd xmm2,xmm1 ; xmm2=tmp12L - movdqa xmm0,xmm7 - paddd xmm7,xmm3 ; xmm7=tmp11H - psubd xmm0,xmm3 ; xmm0=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm5,xmm6 - movdqa xmm7,xmm4 - paddw xmm5,xmm3 ; xmm5=z3 - paddw xmm7,xmm1 ; xmm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm2,xmm5 - movdqa xmm0,xmm5 - punpcklwd xmm2,xmm7 - punpckhwd xmm0,xmm7 - movdqa xmm5,xmm2 - movdqa xmm7,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H - pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L - pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * 
-0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm2,xmm3 - movdqa xmm0,xmm3 - punpcklwd xmm2,xmm4 - punpckhwd xmm0,xmm4 - movdqa xmm3,xmm2 - movdqa xmm4,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H - pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H - - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L - paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H - paddd xmm3,xmm5 ; xmm3=tmp3L - paddd xmm4,xmm7 ; xmm4=tmp3H - - movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H - - movdqa xmm2,xmm1 - movdqa xmm0,xmm1 - punpcklwd xmm2,xmm6 - punpckhwd xmm0,xmm6 - movdqa xmm1,xmm2 - movdqa xmm6,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H - pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L - pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H - - paddd xmm2,xmm5 ; xmm2=tmp1L - paddd xmm0,xmm7 ; xmm0=tmp1H - paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H - - movdqa xmm2,xmm5 - movdqa xmm0,xmm7 - paddd xmm5,xmm3 ; xmm5=data0L - paddd xmm7,xmm4 ; xmm7=data0H - psubd xmm2,xmm3 ; xmm2=data7L - psubd xmm0,xmm4 ; xmm0=data7H - - movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] - - paddd xmm5,xmm3 - paddd xmm7,xmm3 - psrad xmm5,DESCALE_P1 - psrad xmm7,DESCALE_P1 - paddd xmm2,xmm3 - paddd xmm0,xmm3 - psrad xmm2,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - 
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) - packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) - - movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L - movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H - - movdqa xmm7,xmm4 - movdqa xmm0,xmm3 - paddd xmm4,xmm1 ; xmm4=data1L - paddd xmm3,xmm6 ; xmm3=data1H - psubd xmm7,xmm1 ; xmm7=data6L - psubd xmm0,xmm6 ; xmm0=data6H - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] - - paddd xmm4,xmm1 - paddd xmm3,xmm1 - psrad xmm4,DESCALE_P1 - psrad xmm3,DESCALE_P1 - paddd xmm7,xmm1 - paddd xmm0,xmm1 - psrad xmm7,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) - - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm1,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) - punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) - - movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L - movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H - movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L - movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) - - movdqa xmm5,xmm3 - movdqa xmm6,xmm0 - paddd xmm3,xmm4 ; xmm3=data2L - paddd xmm0,xmm2 ; xmm0=data2H - psubd xmm5,xmm4 ; xmm5=data5L - psubd xmm6,xmm2 ; xmm6=data5H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] - - paddd xmm3,xmm7 - paddd xmm0,xmm7 - psrad xmm3,DESCALE_P1 - psrad xmm0,DESCALE_P1 - paddd xmm5,xmm7 - paddd xmm6,xmm7 - psrad xmm5,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) - packssdw xmm5,xmm6 ; 
xmm5=data5=(50 51 52 53 54 55 56 57) - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L - movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H - movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L - movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H - - movdqa xmm0,xmm1 - movdqa xmm6,xmm4 - paddd xmm1,xmm2 ; xmm1=data3L - paddd xmm4,xmm7 ; xmm4=data3H - psubd xmm0,xmm2 ; xmm0=data4L - psubd xmm6,xmm7 ; xmm6=data4H - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] - - paddd xmm1,xmm2 - paddd xmm4,xmm2 - psrad xmm1,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm0,xmm2 - paddd xmm6,xmm2 - psrad xmm0,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) - packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) - movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) - - movdqa xmm4,xmm3 ; transpose coefficients(phase 1) - punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm6,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) - punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) - punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) - punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) - movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) - - movdqa xmm2,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) - punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5,xmm6 ; transpose 
coefficients(phase 2) - punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) - punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) - - movdqa xmm3,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 - - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) - movdqa xmm4,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 -.column_end: - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm6,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm6,xmm2 ; xmm2=in6=z3 - punpckhwd xmm5,xmm2 - movdqa xmm1,xmm6 - movdqa xmm2,xmm5 - pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L - pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L - pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H - - movdqa xmm3,xmm7 - paddw xmm7,xmm0 ; xmm7=in0+in4 - psubw xmm3,xmm0 ; xmm3=in0-in4 - - pxor xmm4,xmm4 - pxor xmm0,xmm0 - punpcklwd xmm4,xmm7 ; xmm4=tmp0L - punpckhwd xmm0,xmm7 ; xmm0=tmp0H - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS - - movdqa xmm7,xmm4 - paddd xmm4,xmm6 ; xmm4=tmp10L - psubd xmm7,xmm6 ; xmm7=tmp13L - movdqa xmm6,xmm0 - paddd xmm0,xmm5 ; xmm0=tmp10H - psubd xmm6,xmm5 ; xmm6=tmp13H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm4,xmm4 - punpcklwd xmm5,xmm3 ; xmm5=tmp1L - punpckhwd xmm4,xmm3 ; xmm4=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - - movdqa xmm0,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm0,xmm1 ; xmm0=tmp12L - movdqa xmm7,xmm4 - paddd xmm4,xmm2 ; xmm4=tmp11H - psubd xmm7,xmm2 ; xmm7=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa 
XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 - movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 - movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 - movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 - - movdqa xmm5,xmm6 - movdqa xmm4,xmm3 - paddw xmm5,xmm1 ; xmm5=z3 - paddw xmm4,xmm2 ; xmm4=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm0,xmm5 - movdqa xmm7,xmm5 - punpcklwd xmm0,xmm4 - punpckhwd xmm7,xmm4 - movdqa xmm5,xmm0 - movdqa xmm4,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H - pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L - pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm0,xmm1 - movdqa xmm7,xmm1 - punpcklwd xmm0,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm1,xmm0 - movdqa xmm3,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L - pmaddwd 
xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H - pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L - pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H - - paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L - paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H - paddd xmm1,xmm5 ; xmm1=tmp3L - paddd xmm3,xmm4 ; xmm3=tmp3H - - movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H - - movdqa xmm0,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm0,xmm6 - punpckhwd xmm7,xmm6 - movdqa xmm2,xmm0 - movdqa xmm6,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H - pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L - pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H - - paddd xmm0,xmm5 ; xmm0=tmp1L - paddd xmm7,xmm4 ; xmm7=tmp1H - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H - - movdqa xmm0,xmm5 - movdqa xmm7,xmm4 - paddd xmm5,xmm1 ; xmm5=data0L - paddd xmm4,xmm3 ; xmm4=data0H - psubd xmm0,xmm1 ; xmm0=data7L - psubd xmm7,xmm3 ; xmm7=data7H - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] - - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrad xmm5,DESCALE_P2 - psrad xmm4,DESCALE_P2 - paddd xmm0,xmm1 - paddd xmm7,xmm1 - psrad xmm0,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) - packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L - movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H - - movdqa xmm4,xmm3 - movdqa xmm7,xmm1 - paddd xmm3,xmm2 ; xmm3=data1L - paddd xmm1,xmm6 ; xmm1=data1H - psubd xmm4,xmm2 ; xmm4=data6L - psubd xmm7,xmm6 ; xmm7=data6H - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] - - paddd xmm3,xmm2 - 
paddd xmm1,xmm2 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm4,xmm2 - paddd xmm7,xmm2 - psrad xmm4,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) - packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) - - packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H - movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L - movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm4,xmm6 - movdqa xmm0,xmm2 - paddd xmm6,xmm1 ; xmm6=data2L - paddd xmm2,xmm7 ; xmm2=data2H - psubd xmm4,xmm1 ; xmm4=data5L - psubd xmm0,xmm7 ; xmm0=data5H - - movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] - - paddd xmm6,xmm5 - paddd xmm2,xmm5 - psrad xmm6,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm4,xmm5 - paddd xmm0,xmm5 - psrad xmm4,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) - packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) - - movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L - movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H - movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L - movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H - - movdqa xmm2,xmm3 - movdqa xmm0,xmm1 - paddd xmm3,xmm7 ; xmm3=data3L - paddd xmm1,xmm5 ; xmm1=data3H - psubd xmm2,xmm7 ; xmm2=data4L - psubd xmm0,xmm5 ; xmm0=data4H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] - - paddd xmm3,xmm7 - paddd xmm1,xmm7 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm2,xmm7 - paddd xmm0,xmm7 - psrad xmm2,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] - - packssdw xmm3,xmm1 ; 
xmm3=data3=(03 13 23 33 43 53 63 73) - packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm7,xmm5 - paddb xmm1,xmm5 - paddb xmm6,xmm5 - paddb xmm3,xmm5 - - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 2) - punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm3,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 
40 41 42 43 44 45 46 47) - pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 - mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 - mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jiss2red.asm b/Builder/jni-1.11/simd/i386/src/jiss2red.asm deleted file mode 100644 index 238c61d07..000000000 --- a/Builder/jni-1.11/simd/i386/src/jiss2red.asm +++ /dev/null @@ -1,594 +0,0 @@ -; -; jiss2red.asm - reduced-size IDCT (SSE2) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains inverse-DCT routines that produce reduced-size -; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. -; The following code is based directly on the IJG's original jidctred.c; -; see the jidctred.c for more details. -; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%define CONST_BITS 13 -%define PASS1_BITS 2 - -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) - -%if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) -%else -; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) -%endif - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_red_sse2) - -EXTN(jconst_idct_red_sse2): - -PW_F184_MF076 times 4 dw F_1_847,-F_0_765 -PW_F256_F089 times 4 dw F_2_562, F_0_899 -PW_F106_MF217 times 4 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 4 dw F_1_451,-F_0_211 -PW_F362_MF127 times 4 dw F_3_624,-F_1_272 -PW_F085_MF072 times 4 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients, -; producing a reduced-size 4x4 output block. 
-; -; GLOBAL(void) -; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 - - align 16 - global EXTN(jsimd_idct_4x4_sse2) - -EXTN(jsimd_idct_4x4_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - -%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm0,xmm1 - packsswb xmm0,xmm0 - packsswb xmm0,xmm0 - movd eax,xmm0 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm0,PASS1_BITS - - movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) - - pshufd xmm1,xmm0,0x50 ; 
xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) - pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) - pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) - pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) - - jmp near .column_end - alignx 16,7 -%endif -.columnDCT: - - ; -- Odd part - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm0 - punpcklwd xmm4,xmm1 - punpckhwd xmm5,xmm1 - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) - pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) - pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) - pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) - - movdqa xmm6,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm6,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) - pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) - pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) - - paddd xmm6,xmm4 ; xmm6=tmp2L - paddd xmm7,xmm5 ; xmm7=tmp2H - paddd xmm2,xmm0 ; xmm2=tmp0L - paddd xmm3,xmm1 ; xmm3=tmp0H - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H - - ; -- Even part - - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor xmm1,xmm1 - pxor xmm2,xmm2 - punpcklwd xmm1,xmm4 ; xmm1=tmp0L - punpckhwd xmm2,xmm4 ; xmm2=tmp0H - psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 - psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 - - movdqa xmm3,xmm5 ; xmm5=in2=z2 - punpcklwd xmm5,xmm0 ; xmm0=in6=z3 - punpckhwd xmm3,xmm0 - pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L - pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H - - movdqa xmm4,xmm1 - movdqa xmm0,xmm2 - paddd xmm1,xmm5 ; xmm1=tmp10L - paddd xmm2,xmm3 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp12L - psubd xmm0,xmm3 ; xmm0=tmp12H - - ; -- Final output stage - - movdqa xmm5,xmm1 - movdqa xmm3,xmm2 - paddd xmm1,xmm6 ; xmm1=data0L - paddd xmm2,xmm7 ; xmm2=data0H - psubd xmm5,xmm6 ; xmm5=data3L - psubd xmm3,xmm7 ; xmm3=data3H - - movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] - - paddd xmm1,xmm6 - paddd xmm2,xmm6 - psrad xmm1,DESCALE_P1_4 - psrad xmm2,DESCALE_P1_4 - paddd xmm5,xmm6 - paddd xmm3,xmm6 - psrad xmm5,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) - packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H - - movdqa xmm2,xmm4 - movdqa xmm3,xmm0 - paddd xmm4,xmm7 ; xmm4=data1L - paddd xmm0,xmm6 ; xmm0=data1H - psubd xmm2,xmm7 ; xmm2=data2L - psubd xmm3,xmm6 ; xmm3=data2H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] - - paddd xmm4,xmm7 - paddd xmm0,xmm7 - psrad xmm4,DESCALE_P1_4 - psrad xmm0,DESCALE_P1_4 - paddd xmm2,xmm7 - paddd xmm3,xmm7 - psrad xmm2,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) - - movdqa xmm6,xmm1 ; transpose 
coefficients(phase 1) - punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm7,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) - - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) - punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) - movdqa xmm3,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) - punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) -.column_end: - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows, store into output array. - - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - pxor xmm4,xmm4 - punpcklwd xmm4,xmm1 ; xmm4=tmp0 - psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 - - ; -- Odd part - - punpckhwd xmm1,xmm0 - punpckhwd xmm6,xmm3 - movdqa xmm5,xmm1 - movdqa xmm2,xmm6 - pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) - pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) - pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) - pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) - - paddd xmm6,xmm1 ; xmm6=tmp2 - paddd xmm2,xmm5 ; xmm2=tmp0 - - ; -- Even part - - punpcklwd xmm0,xmm3 - pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 - - movdqa xmm7,xmm4 - paddd xmm4,xmm0 ; xmm4=tmp10 - psubd xmm7,xmm0 ; xmm7=tmp12 - - ; -- Final output stage - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] - - movdqa xmm5,xmm4 - movdqa xmm3,xmm7 - paddd xmm4,xmm6 ; 
xmm4=data0=(00 10 20 30) - paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) - psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) - psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) - - paddd xmm4,xmm1 - paddd xmm7,xmm1 - psrad xmm4,DESCALE_P2_4 - psrad xmm7,DESCALE_P2_4 - paddd xmm5,xmm1 - paddd xmm3,xmm1 - psrad xmm5,DESCALE_P2_4 - psrad xmm3,DESCALE_P2_4 - - packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) - packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) - - movdqa xmm0,xmm4 ; transpose coefficients(phase 1) - punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) - punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) - - movdqa xmm6,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) - punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) - - packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) - paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] - - pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) - pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) - pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - - -; -------------------------------------------------------------------------- -; -; Perform dequantization and inverse DCT on one block of coefficients, -; producing a reduced-size 2x2 output block. 
-; -; GLOBAL(void) -; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - - align 16 - global EXTN(jsimd_idct_2x2_sse2) - -EXTN(jsimd_idct_2x2_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - - mov edx, POINTER [dct_table(ebp)] ; quantptr - mov esi, JCOEFPTR [coef_block(ebp)] ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) - ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) - - pcmpeqd xmm7,xmm7 - pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} - - movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) - movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) - punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) - punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) 
- pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] - - psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) - pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) - psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) - pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) - por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) - por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) - pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] - - paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] - paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] - - ; -- Even part - - movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; xmm6=(00 01 ** 03 ** 05 ** 07) - - movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) - pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) - pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) - psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] - psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] - - ; -- Final output stage - - movdqa xmm3,xmm6 - movdqa xmm5,xmm1 - paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) - paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) - psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) - psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] - - punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) - - movdqa xmm7,xmm1 - punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) - punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) - - paddd xmm6,xmm2 - psrad xmm6,DESCALE_P1_2 - - paddd xmm1,xmm2 - paddd xmm7,xmm2 - psrad xmm1,DESCALE_P1_2 - psrad xmm7,DESCALE_P1_2 - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + 
DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows, store into output array. - - mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(ebp)] - - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | - - ; -- Odd part - - packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) - packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) - pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] - - paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] - - ; -- Even part - - pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] - - ; -- Final output stage - - movdqa xmm4,xmm6 - paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) - psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) - - punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) - - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] - psrad xmm6,DESCALE_P2_2 - - packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) - packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) - paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] - - pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) - pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jisseflt.asm b/Builder/jni-1.11/simd/i386/src/jisseflt.asm deleted file mode 100644 index d6147c12d..000000000 --- a/Builder/jni-1.11/simd/i386/src/jisseflt.asm +++ /dev/null @@ -1,572 +0,0 @@ -; -; jisseflt.asm - floating-point IDCT (SSE & MMX) -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; This file contains a floating-point implementation of the inverse DCT -; (Discrete Cosine Transform). The following code is based directly on -; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
-; -; [TAB8] - -%include "jsimdext.inc" -%include "jdct.inc" - -; -------------------------------------------------------------------------- - -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 -%endmacro - -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE -%endmacro - -; -------------------------------------------------------------------------- - SECTION SEG_CONST - - alignz 16 - global EXTN(jconst_idct_float_sse) - -EXTN(jconst_idct_float_sse): - -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_0_125 times 4 dd 0.125 ; 1/8 -PB_CENTERJSAMP times 8 db CENTERJSAMPLE - - alignz 16 - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Perform dequantization and inverse DCT on one block of coefficients. -; -; GLOBAL(void) -; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) -; - -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col - -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] - - align 16 - global EXTN(jsimd_idct_float_sse) - -EXTN(jsimd_idct_float_sse): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store 
into work array. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.columnloop: -%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm1,mm0 - packsswb mm1,mm1 - movd eax,mm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) - psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) - cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) - cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) - movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD 
[XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near .nextcolumn - alignx 16,7 -%endif -.columnDCT: - - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) - punpcklwd mm1,mm1 ; mm1=(20 20 21 21) - - psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) - psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) - cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) - cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) - psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) - psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) - cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) - cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) - - punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) - punpcklwd mm2,mm2 ; mm2=(40 40 41 41) - punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) - punpcklwd mm3,mm3 ; mm3=(60 60 61 61) - - psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) - psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) - cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) - cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) - psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) - psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) - cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) - cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) - - movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) - movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23) - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43) - movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) - mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - 
mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) - punpcklwd mm4,mm4 ; mm4=(10 10 11 11) - punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) - punpcklwd mm0,mm0 ; mm0=(30 30 31 31) - - psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) - psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) - cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) - cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) - psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) - psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) - cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) - cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) - - punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) - punpcklwd mm5,mm5 ; mm5=(50 50 51 51) - punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) - punpcklwd mm1,mm1 ; mm1=(70 70 71 71) - - movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) - movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) - - psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) - psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) - cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) - cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) - psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) - psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) - cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **) - cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) - movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) - mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, 
XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) - - movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 
20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - -.nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 -.rowloop: - - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps 
xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] - - mulps xmm6,xmm1 ; descale(1/8) - mulps xmm7,xmm1 ; descale(1/8) - mulps xmm5,xmm1 ; descale(1/8) - mulps xmm0,xmm1 ; descale(1/8) - - movhlps xmm3,xmm6 - movhlps xmm1,xmm7 - cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) - cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) - cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) - cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) - packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) - packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) - - movhlps xmm6,xmm5 - movhlps xmm7,xmm0 - cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) - cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) - cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) - cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) - packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) - packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) - - packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) - packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) - - movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 - movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm5,xmm3 - movaps xmm0,xmm1 - addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) - addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) - subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) - subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) - - mulps xmm3,xmm6 ; descale(1/8) - mulps xmm1,xmm6 ; descale(1/8) - mulps xmm5,xmm6 ; descale(1/8) - mulps xmm0,xmm6 ; descale(1/8) - - movhlps xmm7,xmm3 - movhlps xmm2,xmm1 - cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) - cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) - cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) - cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) - packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) - packssdw mm3,mm7 ; mm3=data4=(04 14 24 34) - - movhlps xmm4,xmm5 - 
movhlps xmm6,xmm0 - cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) - cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) - cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) - cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) - packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) - packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) - - movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] - - packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) - packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) - - paddb mm0,mm6 - paddb mm1,mm6 - paddb mm2,mm6 - paddb mm4,mm6 - - movq mm7,mm0 ; transpose coefficients(phase 1) - punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) - punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) - movq mm3,mm2 ; transpose coefficients(phase 1) - punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) - punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) - - movq mm5,mm0 ; transpose coefficients(phase 2) - punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) - punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) - movq mm6,mm3 ; transpose coefficients(phase 2) - punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) - punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) - - movq mm1,mm0 ; transpose coefficients(phase 3) - punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) - punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) - movq mm4,mm5 ; transpose coefficients(phase 3) - punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) - punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 
4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. - align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jsimdcfg.inc b/Builder/jni-1.11/simd/i386/src/jsimdcfg.inc deleted file mode 100644 index 3ce75d334..000000000 --- a/Builder/jni-1.11/simd/i386/src/jsimdcfg.inc +++ /dev/null @@ -1,181 +0,0 @@ -; -; Automatically generated include file from jsimdcfg.inc.h -; - -; -; -- jpeglib.h -; - -%define DCTSIZE 8 -%define DCTSIZE2 64 - -; -; -- jmorecfg.h -; - -%define RGB_RED 0 -%define RGB_GREEN 1 -%define RGB_BLUE 2 -%define RGB_PIXELSIZE 3 - -%define EXT_RGB_RED 0 -%define EXT_RGB_GREEN 1 -%define EXT_RGB_BLUE 2 -%define EXT_RGB_PIXELSIZE 3 - -%define EXT_RGBX_RED 0 -%define EXT_RGBX_GREEN 1 -%define EXT_RGBX_BLUE 2 -%define EXT_RGBX_PIXELSIZE 4 - -%define EXT_BGR_RED 2 -%define EXT_BGR_GREEN 1 -%define EXT_BGR_BLUE 0 -%define EXT_BGR_PIXELSIZE 3 - -%define EXT_BGRX_RED 2 -%define EXT_BGRX_GREEN 1 -%define EXT_BGRX_BLUE 0 -%define EXT_BGRX_PIXELSIZE 4 - -%define EXT_XBGR_RED 3 -%define EXT_XBGR_GREEN 2 -%define EXT_XBGR_BLUE 1 -%define EXT_XBGR_PIXELSIZE 4 - -%define EXT_XRGB_RED 1 -%define EXT_XRGB_GREEN 2 -%define EXT_XRGB_BLUE 3 -%define EXT_XRGB_PIXELSIZE 4 - -%define RGBX_FILLER_0XFF 1 - -; Representation of a single sample (pixel element value). -; On this SIMD implementation, this must be 'unsigned char'. -; - -%define JSAMPLE byte ; unsigned char -%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) - -%define CENTERJSAMPLE 128 - -; Representation of a DCT frequency coefficient. -; On this SIMD implementation, this must be 'short'. 
-; -%define JCOEF word ; short -%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) - -; Datatype used for image dimensions. -; On this SIMD implementation, this must be 'unsigned int'. -; -%define JDIMENSION dword ; unsigned int -%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) - -%define JSAMPROW POINTER ; JSAMPLE FAR * (jpeglib.h) -%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) -%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) -%define JCOEFPTR POINTER ; JCOEF FAR * (jpeglib.h) -%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) -%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) -%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) -%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) - -; -; -- jdct.h -; - -; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; -; the DCT is to be performed in-place in that buffer. -; To maximize parallelism, Type DCTELEM is changed to short (originally, int). -; -%define DCTELEM word ; short -%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) - -%define FAST_FLOAT FP32 ; float -%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) - -; To maximize parallelism, Type MULTIPLIER is changed to short. -; -%define ISLOW_MULT_TYPE word ; must be short -%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) - -%define IFAST_MULT_TYPE word ; must be short -%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) -%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors - -%define FLOAT_MULT_TYPE FP32 ; must be float -%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) - -; -; -- jsimd.h -; - -%define JSIMD_NONE 0 -%define JSIMD_MMX 1 -%define JSIMD_3DNOW 2 -%define JSIMD_SSE 4 -%define JSIMD_SSE2 8 - -; Short forms of external names for systems with brain-damaged linkers. 
-; -%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support -%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx -%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx -%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2 -%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2 -%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2 -%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2 -%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx -%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx -%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2 -%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2 -%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx -%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx -%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx -%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx -%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx -%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx -%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2 -%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2 -%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2 -%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2 -%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2 -%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2 -%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2 -%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 
jsimd_h2v2_merged_upsample_sse2 -%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx -%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2 -%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow -%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse -%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2 -%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx -%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx -%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2 -%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2 -%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2 -%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2 -%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow -%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse -%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse -%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx -%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2 -%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow -%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse -%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2 -%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx -%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx -%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2 -%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2 -%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2 -%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx -%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx -%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2 -%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2 -%define 
_cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2 -%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2 -%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow -%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse -%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse -%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2 -%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2 - diff --git a/Builder/jni-1.11/simd/i386/src/jsimdcfg.inc.h b/Builder/jni-1.11/simd/i386/src/jsimdcfg.inc.h deleted file mode 100644 index 583b7e325..000000000 --- a/Builder/jni-1.11/simd/i386/src/jsimdcfg.inc.h +++ /dev/null @@ -1,196 +0,0 @@ -// This file generates the include file for the assembly -// implementations by abusing the C preprocessor. -// -// Note: Some things are manually defined as they need to -// be mapped to NASM types. - -; -; Automatically generated include file from jsimdcfg.inc.h -; - -#define JPEG_INTERNALS - -#include "../jpeglib.h" -#include "../jconfig.h" -#include "../jmorecfg.h" -#include "jsimd.h" - -; -; -- jpeglib.h -; - -%define _cpp_protection_DCTSIZE DCTSIZE -%define _cpp_protection_DCTSIZE2 DCTSIZE2 - -; -; -- jmorecfg.h -; - -%define _cpp_protection_RGB_RED RGB_RED -%define _cpp_protection_RGB_GREEN RGB_GREEN -%define _cpp_protection_RGB_BLUE RGB_BLUE -%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE - -%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED -%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN -%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE -%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE - -%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED -%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN -%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE -%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE - -%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED -%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN -%define 
_cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE -%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE - -%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED -%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN -%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE -%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE - -%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED -%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN -%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE -%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE - -%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED -%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN -%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE -%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE - -%define RGBX_FILLER_0XFF 1 - -; Representation of a single sample (pixel element value). -; On this SIMD implementation, this must be 'unsigned char'. -; - -%define JSAMPLE byte ; unsigned char -%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) - -%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE - -; Representation of a DCT frequency coefficient. -; On this SIMD implementation, this must be 'short'. -; -%define JCOEF word ; short -%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) - -; Datatype used for image dimensions. -; On this SIMD implementation, this must be 'unsigned int'. 
-; -%define JDIMENSION dword ; unsigned int -%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) - -%define JSAMPROW POINTER ; JSAMPLE FAR * (jpeglib.h) -%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) -%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) -%define JCOEFPTR POINTER ; JCOEF FAR * (jpeglib.h) -%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) -%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) -%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) -%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) - -; -; -- jdct.h -; - -; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; -; the DCT is to be performed in-place in that buffer. -; To maximize parallelism, Type DCTELEM is changed to short (originally, int). -; -%define DCTELEM word ; short -%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) - -%define FAST_FLOAT FP32 ; float -%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) - -; To maximize parallelism, Type MULTIPLIER is changed to short. -; -%define ISLOW_MULT_TYPE word ; must be short -%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) - -%define IFAST_MULT_TYPE word ; must be short -%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) -%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors - -%define FLOAT_MULT_TYPE FP32 ; must be float -%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) - -; -; -- jsimd.h -; - -%define _cpp_protection_JSIMD_NONE JSIMD_NONE -%define _cpp_protection_JSIMD_MMX JSIMD_MMX -%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW -%define _cpp_protection_JSIMD_SSE JSIMD_SSE -%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 - -; Short forms of external names for systems with brain-damaged linkers. 
-; -#ifdef NEED_SHORT_EXTERNAL_NAMES -%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support -%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx -%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx -%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2 -%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2 -%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2 -%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2 -%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx -%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx -%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2 -%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2 -%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx -%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx -%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx -%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx -%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx -%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx -%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2 -%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2 -%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2 -%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2 -%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2 -%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2 -%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2 -%define 
_cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2 -%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx -%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2 -%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow -%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse -%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2 -%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx -%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx -%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2 -%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2 -%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2 -%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2 -%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow -%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse -%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse -%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx -%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2 -%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow -%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse -%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2 -%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx -%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx -%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2 -%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2 -%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2 -%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx -%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx -%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2 -%define _cpp_protection_jsimd_idct_islow_sse2 
jsimd_idct_islow_sse2 -%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2 -%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2 -%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow -%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse -%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse -%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2 -%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2 -#endif /* NEED_SHORT_EXTERNAL_NAMES */ - diff --git a/Builder/jni-1.11/simd/i386/src/jsimdcpu.asm b/Builder/jni-1.11/simd/i386/src/jsimdcpu.asm deleted file mode 100644 index bdbcc2317..000000000 --- a/Builder/jni-1.11/simd/i386/src/jsimdcpu.asm +++ /dev/null @@ -1,105 +0,0 @@ -; -; jsimdcpu.asm - SIMD instruction support check -; -; Copyright 2009 Pierre Ossman for Cendio AB -; -; Based on -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. -; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] - -%include "jsimdext.inc" - -; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -; -; Check if the CPU supports SIMD instructions -; -; GLOBAL(unsigned int) -; jpeg_simd_cpu_support (void) -; - - align 16 - global EXTN(jpeg_simd_cpu_support) - -EXTN(jpeg_simd_cpu_support): - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused - push edi - - xor edi,edi ; simd support flag - - pushfd - pop eax - mov edx,eax - xor eax, 1<<21 ; flip ID bit in EFLAGS - push eax - popfd - pushfd - pop eax - xor eax,edx - jz short .return ; CPUID is not supported - - ; Check for MMX instruction support - xor eax,eax - cpuid - test eax,eax - jz short .return - - xor eax,eax - inc eax - cpuid - mov eax,edx ; eax = Standard feature flags - - test eax, 1<<23 ; bit23:MMX - jz short .no_mmx - or edi, byte JSIMD_MMX -.no_mmx: - test eax, 1<<25 ; bit25:SSE - jz short .no_sse - or edi, byte JSIMD_SSE -.no_sse: - test eax, 1<<26 ; bit26:SSE2 - jz short .no_sse2 - or edi, byte JSIMD_SSE2 -.no_sse2: - - ; Check for 3DNow! instruction support - mov eax, 0x80000000 - cpuid - cmp eax, 0x80000000 - jbe short .return - - mov eax, 0x80000001 - cpuid - mov eax,edx ; eax = Extended feature flags - - test eax, 1<<31 ; bit31:3DNow!(vendor independent) - jz short .no_3dnow - or edi, byte JSIMD_3DNOW -.no_3dnow: - -.return: - mov eax,edi - - pop edi -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - ret - -; For some reason, the OS X linker does not honor the request to align the -; segment unless we do this. 
- align 16 diff --git a/Builder/jni-1.11/simd/jccolext-altivec.c b/Builder/jni-1.11/simd/jccolext-altivec.c new file mode 100644 index 000000000..849825eb0 --- /dev/null +++ b/Builder/jni-1.11/simd/jccolext-altivec.c @@ -0,0 +1,267 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* This file is included by jccolor-altivec.c */ + + +void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr0, outptr1, outptr2; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, + rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = {0}; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = {0}; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh, crl, crh, cbl, cbh; + __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }, + pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) }, + pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) }; + __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }, + pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr0 += 16, outptr1 += 16, outptr2 += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if (offset) { + __vector unsigned char 
unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = VEC_LD(0, tmpbuf); + rgb1 = VEC_LD(16, tmpbuf); + rgb2 = VEC_LD(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = VEC_LD(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = VEC_LD(0, inptr); + if (num_cols > 16) + rgb1 = VEC_LD(16, inptr); + if (num_cols > 32) + rgb2 = VEC_LD(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = VEC_LD(48, inptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 
R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. 
+ */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. 
+ */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr0); + + /* Calculate Cb values */ + cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj); + cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj); + cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj); + cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj); + cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000, + (__vector unsigned int)cb0); + cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000, + (__vector unsigned int)cb1); + cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000, + (__vector unsigned int)cb2); + cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000, + (__vector unsigned int)cb3); + cbl = vec_perm((__vector unsigned short)cb0, + (__vector unsigned short)cb1, shift_pack_index); + cbh = vec_perm((__vector unsigned short)cb2, + (__vector unsigned short)cb3, shift_pack_index); + cb = vec_pack(cbl, cbh); + vec_st(cb, 0, outptr1); + + /* Calculate Cr values */ + cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj); + cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj); + cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj); + cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj); + cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000, + (__vector unsigned int)cr0); + cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000, + (__vector unsigned int)cr1); + cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000, + (__vector unsigned int)cr2); + cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000, + (__vector unsigned int)cr3); + crl = vec_perm((__vector unsigned short)cr0, + (__vector unsigned short)cr1, shift_pack_index); + crh = vec_perm((__vector unsigned 
short)cr2, + (__vector unsigned short)cr3, shift_pack_index); + cr = vec_pack(crl, crh); + vec_st(cr, 0, outptr2); + } + } +} diff --git a/Builder/jni-1.11/simd/jccolext-mmx.asm b/Builder/jni-1.11/simd/jccolext-mmx.asm new file mode 100644 index 000000000..96a0372b1 --- /dev/null +++ b/Builder/jni-1.11/simd/jccolext-mmx.asm @@ -0,0 +1,476 @@ +; +; jccolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_rgb_ycc_convert_mmx) + +EXTN(jsimd_rgb_ycc_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short 
.columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_ycc_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH,mmH + + movq mmC,mmA + 
punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) + + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) + + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF,mmF + + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) + + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) + + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) + + punpcklbw mmF,mmH + 
punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movq MMWORD [wk(0)], mm0 ; wk(0)=RE + movq MMWORD [wk(1)], mm1 ; wk(1)=RO + movq MMWORD [wk(2)], mm4 ; wk(2)=BE + movq MMWORD [wk(3)], mm5 ; wk(3)=BO + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + movq mm7,mm1 + movq mm4,mm6 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor mm1,mm1 + pxor mm6,mm6 + punpcklwd mm1,mm5 ; mm1=BOL + punpckhwd mm6,mm5 ; mm6=BOH + psrld mm1,1 ; mm1=BOL*FIX(0.500) + psrld mm6,1 ; mm6=BOH*FIX(0.500) + + movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] + + paddd mm7,mm1 + paddd mm4,mm6 + paddd mm7,mm5 + paddd mm4,mm5 + psrld mm7,SCALEBITS ; mm7=CbOL + psrld mm4,SCALEBITS ; mm4=CbOH + packssdw mm7,mm4 ; mm7=CbO + + movq mm1, MMWORD [wk(2)] ; mm1=BE + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + movq mm5,mm0 + movq mm4,mm6 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd 
mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor mm0,mm0 + pxor mm6,mm6 + punpcklwd mm0,mm1 ; mm0=BEL + punpckhwd mm6,mm1 ; mm6=BEH + psrld mm0,1 ; mm0=BEL*FIX(0.500) + psrld mm6,1 ; mm6=BEH*FIX(0.500) + + movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm5,mm0 + paddd mm4,mm6 + paddd mm5,mm1 + paddd mm4,mm1 + psrld mm5,SCALEBITS ; mm5=CbEL + psrld mm4,SCALEBITS ; mm4=CbEH + packssdw mm5,mm4 ; mm5=CbE + + psllw mm7,BYTE_BIT + por mm5,mm7 ; mm5=Cb + movq MMWORD [ebx], mm5 ; Save Cb + + movq mm0, MMWORD [wk(3)] ; mm0=BO + movq mm6, MMWORD [wk(2)] ; mm6=BE + movq mm1, MMWORD [wk(1)] ; mm1=RO + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + movq mm7,mm0 + movq mm5,mm4 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, MMWORD [wk(4)] + paddd mm4, MMWORD [wk(5)] + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + pxor mm3,mm3 + pxor mm4,mm4 + punpcklwd mm3,mm1 ; mm3=ROL + punpckhwd mm4,mm1 ; mm4=ROH + psrld mm3,1 ; mm3=ROL*FIX(0.500) + psrld mm4,1 ; mm4=ROH*FIX(0.500) + + movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm7,mm3 + paddd mm5,mm4 + paddd mm7,mm1 + paddd mm5,mm1 + psrld mm7,SCALEBITS ; mm7=CrOL + psrld mm5,SCALEBITS ; mm5=CrOH + packssdw mm7,mm5 ; mm7=CrO + + movq mm3, 
MMWORD [wk(0)] ; mm3=RE + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + movq mm1,mm6 + movq mm5,mm4 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(6)] + paddd mm4, MMWORD [wk(7)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + pxor mm2,mm2 + pxor mm4,mm4 + punpcklwd mm2,mm3 ; mm2=REL + punpckhwd mm4,mm3 ; mm4=REH + psrld mm2,1 ; mm2=REL*FIX(0.500) + psrld mm4,1 ; mm4=REH*FIX(0.500) + + movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] + + paddd mm1,mm2 + paddd mm5,mm4 + paddd mm1,mm0 + paddd mm5,mm0 + psrld mm1,SCALEBITS ; mm1=CrEL + psrld mm5,SCALEBITS ; mm5=CrEH + packssdw mm1,mm5 ; mm1=CrE + + psllw mm7,BYTE_BIT + por mm1,mm7 ; mm1=Cr + movq MMWORD [edx], mm1 ; Save Cr + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + add ebx, byte SIZEOF_MMWORD ; outptr1 + add edx, byte SIZEOF_MMWORD ; outptr2 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For 
some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jccolext-sse2-64.asm b/Builder/jni-1.11/simd/jccolext-sse2-64.asm new file mode 100644 index 000000000..8e4642d3b --- /dev/null +++ b/Builder/jni-1.11/simd/jccolext-sse2-64.asm @@ -0,0 +1,486 @@ +; +; jccolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +; r10 = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13 = JDIMENSION output_row +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 + + align 16 + + global EXTN(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rdx + push rbx + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + mov rbx, JSAMPROW [rbx] ; outptr1 + mov rdx, JSAMPROW [rdx] ; outptr2 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx +.column_ld4: + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF 
+.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq 
xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 
3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 
0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor 
xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [rbx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[rel 
PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [rdx], xmm1 ; Save Cr + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + add rbx, byte SIZEOF_XMMWORD ; outptr1 + add rdx, byte SIZEOF_XMMWORD ; outptr2 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + pop rbx + pop rdx + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jccolext-sse2.asm b/Builder/jni-1.11/simd/jccolext-sse2.asm new file mode 100644 index 000000000..cc38e98a1 --- /dev/null +++ b/Builder/jni-1.11/simd/jccolext-sse2.asm @@ -0,0 +1,503 @@ +; +; jccolext.asm - colorspace conversion (SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + + global EXTN(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + 
movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa 
xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; 
xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + 
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa 
XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + 
psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd 
xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD ; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jccolor-altivec.c b/Builder/jni-1.11/simd/jccolor-altivec.c new file mode 100644 index 000000000..ec473320e --- /dev/null +++ b/Builder/jni-1.11/simd/jccolor-altivec.c @@ -0,0 +1,104 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
+ * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> YCC CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_081 5329 /* FIX(0.08131) */ +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_168 11059 /* FIX(0.16874) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_331 21709 /* FIX(0.33126) */ +#define F_0_418 27439 /* FIX(0.41869) */ +#define F_0_500 32768 /* FIX(0.50000) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10} +#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22} +#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18} +#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14} +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 
+#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13} +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10} +#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22} +#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18} +#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14} +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13} +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14} +#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14} +#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec diff --git a/Builder/jni-1.11/simd/i386/src/jccolmmx.asm b/Builder/jni-1.11/simd/jccolor-mmx.asm similarity index 64% rename from Builder/jni-1.11/simd/i386/src/jccolmmx.asm rename to Builder/jni-1.11/simd/jccolor-mmx.asm index 
9650e47d4..c4e6d88be 100644 --- a/Builder/jni-1.11/simd/i386/src/jccolmmx.asm +++ b/Builder/jni-1.11/simd/jccolor-mmx.asm @@ -1,11 +1,10 @@ ; -; jccolmmx.asm - colorspace conversion (MMX) +; jccolor.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2009 D. R. Commander +; Copyright (C) 2009, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -21,40 +20,40 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_ycc_convert_mmx) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_mmx) EXTN(jconst_rgb_ycc_convert_mmx): -PW_F0299_F0337 times 2 dw F_0_299, F_0_337 -PW_F0114_F0250 times 2 dw F_0_114, F_0_250 -PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331 -PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 -PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) -PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 2 dw F_0_299, 
F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -65,7 +64,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGB_BLUE %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -76,7 +75,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGBX_BLUE %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -87,7 +86,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGR_BLUE %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -98,7 +97,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGRX_BLUE %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -109,7 +108,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_XBGR_BLUE %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -120,4 +119,4 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define 
RGB_BLUE EXT_XRGB_BLUE %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx -%include "jcclrmmx.asm" +%include "jccolext-mmx.asm" diff --git a/Builder/jni-1.11/simd/jccolor-sse2-64.asm b/Builder/jni-1.11/simd/jccolor-sse2-64.asm new file mode 100644 index 000000000..bd2188b4c --- /dev/null +++ b/Builder/jni-1.11/simd/jccolor-sse2-64.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; 
-------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include 
"jccolext-sse2-64.asm" diff --git a/Builder/jni-1.11/simd/i386/src/jccolss2.asm b/Builder/jni-1.11/simd/jccolor-sse2.asm similarity index 65% rename from Builder/jni-1.11/simd/i386/src/jccolss2.asm rename to Builder/jni-1.11/simd/jccolor-sse2.asm index ac001d186..13124d13d 100644 --- a/Builder/jni-1.11/simd/i386/src/jccolss2.asm +++ b/Builder/jni-1.11/simd/jccolor-sse2.asm @@ -1,9 +1,10 @@ ; -; jccolss2.asm - colorspace conversion (SSE2) +; jccolor.asm - colorspace conversion (SSE2) ; -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; ; This file should be assembled with NASM (Netwide Assembler), @@ -18,40 +19,40 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_ycc_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 
4 dw F_0_114, F_0_250 -PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 -PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 -PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -62,7 +63,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGB_BLUE %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -73,7 +74,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGBX_BLUE %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -84,7 +85,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGR_BLUE %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -95,7 +96,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGRX_BLUE %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -106,7 +107,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) 
%define RGB_BLUE EXT_XBGR_BLUE %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -117,4 +118,4 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_XRGB_BLUE %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 -%include "jcclrss2.asm" +%include "jccolext-sse2.asm" diff --git a/Builder/jni-1.11/simd/jcgray-altivec.c b/Builder/jni-1.11/simd/jcgray-altivec.c new file mode 100644 index 000000000..684df5ef1 --- /dev/null +++ b/Builder/jni-1.11/simd/jcgray-altivec.c @@ -0,0 +1,99 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* RGB --> GRAYSCALE CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10} +#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22} +#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18} +#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14} +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13} +#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10} +#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22} +#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18} +#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14} +#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13} +#define jsimd_rgb_gray_convert_altivec 
jsimd_extbgrx_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14} +#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14} +#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec diff --git a/Builder/jni-1.11/simd/i386/src/jcgrammx.asm b/Builder/jni-1.11/simd/jcgray-mmx.asm similarity index 71% rename from Builder/jni-1.11/simd/i386/src/jcgrammx.asm rename to Builder/jni-1.11/simd/jcgray-mmx.asm index b8b8dd3ad..0819b6ca0 100644 --- a/Builder/jni-1.11/simd/i386/src/jcgrammx.asm +++ b/Builder/jni-1.11/simd/jcgray-mmx.asm @@ -1,11 +1,10 @@ ; -; jcgrammx.asm - grayscale colorspace conversion (MMX) +; jcgray.asm - grayscale colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2011 D. R. Commander +; Copyright (C) 2011, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -21,33 +20,33 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_mmx) + alignz 16 + global EXTN(jconst_rgb_gray_convert_mmx) EXTN(jconst_rgb_gray_convert_mmx): -PW_F0299_F0337 times 2 dw F_0_299, F_0_337 -PW_F0114_F0250 times 2 dw F_0_114, F_0_250 -PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -58,7 +57,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGB_BLUE %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -69,7 +68,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGBX_BLUE %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -80,7 +79,7 @@ PD_ONEHALF times 2 dd (1 
<< (SCALEBITS-1)) %define RGB_BLUE EXT_BGR_BLUE %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -91,7 +90,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGRX_BLUE %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -102,7 +101,7 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_XBGR_BLUE %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -113,4 +112,4 @@ PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_XRGB_BLUE %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx -%include "jcgrymmx.asm" +%include "jcgryext-mmx.asm" diff --git a/Builder/jni-1.11/simd/jcgray-sse2-64.asm b/Builder/jni-1.11/simd/jcgray-sse2-64.asm new file mode 100644 index 000000000..bafd302aa --- /dev/null +++ b/Builder/jni-1.11/simd/jcgray-sse2-64.asm @@ -0,0 +1,114 @@ +; +; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED 
+%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" diff --git a/Builder/jni-1.11/simd/i386/src/jcgrass2.asm b/Builder/jni-1.11/simd/jcgray-sse2.asm similarity index 72% rename from Builder/jni-1.11/simd/i386/src/jcgrass2.asm rename to Builder/jni-1.11/simd/jcgray-sse2.asm index 998968e76..5b0b46695 100644 --- a/Builder/jni-1.11/simd/i386/src/jcgrass2.asm +++ b/Builder/jni-1.11/simd/jcgray-sse2.asm @@ -1,9 +1,10 @@ ; -; jcgrass2.asm - grayscale colorspace conversion (SSE2) +; jcgray.asm - grayscale colorspace conversion (SSE2) ; -; x86 SIMD extension for IJG JPEG library -; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; ; This file should be assembled with NASM (Netwide Assembler), @@ -18,33 +19,33 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -55,7 +56,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGB_BLUE %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -66,7 +67,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_RGBX_BLUE %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" 
%undef RGB_RED %undef RGB_GREEN @@ -77,7 +78,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGR_BLUE %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -88,7 +89,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_BGRX_BLUE %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -99,7 +100,7 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_XBGR_BLUE %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -110,4 +111,4 @@ PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) %define RGB_BLUE EXT_XRGB_BLUE %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 -%include "jcgryss2.asm" +%include "jcgryext-sse2.asm" diff --git a/Builder/jni-1.11/simd/jcgryext-altivec.c b/Builder/jni-1.11/simd/jcgryext-altivec.c new file mode 100644 index 000000000..7f8232bb2 --- /dev/null +++ b/Builder/jni-1.11/simd/jcgryext-altivec.c @@ -0,0 +1,227 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. 
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jcgray-altivec.c */ + + +void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width, + JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; +#endif + + __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, + rgbg0, rgbg1, rgbg2, rgbg3, y; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = {0}; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = {0}; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh; + __vector int y0, y1, y2, y3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if (offset) { + __vector 
unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (num_cols > 16) + rgb1 = vec_ld(16, inptr); + if (num_cols > 32) + rgb2 = vec_ld(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = vec_ld(48, inptr); +#endif + } + } +#else + /* Little endian */ + rgb0 = vec_vsx_ld(0, inptr); + if (num_cols > 16) + rgb1 = vec_vsx_ld(16, inptr); + if (num_cols > 32) + rgb2 = 
vec_vsx_ld(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = vec_vsx_ld(48, inptr); +#endif +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. 
+ */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr); + } + } +} diff --git a/Builder/jni-1.11/simd/jcgryext-mmx.asm b/Builder/jni-1.11/simd/jcgryext-mmx.asm new file mode 100644 index 000000000..1c1b8d8bc --- /dev/null +++ b/Builder/jni-1.11/simd/jcgryext-mmx.asm @@ -0,0 +1,356 @@ +; +; jcgryext.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_rgb_gray_convert_mmx) + +EXTN(jsimd_rgb_gray_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT 
[num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq 
mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH,mmH + + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) + + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) + + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF,mmF + + movq 
mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) + + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) + + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) + + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd 
mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jcgryext-sse2-64.asm b/Builder/jni-1.11/simd/jcgryext-sse2-64.asm new file mode 100644 index 000000000..541355af8 --- /dev/null +++ b/Builder/jni-1.11/simd/jcgryext-sse2-64.asm @@ -0,0 +1,365 @@ +; +; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +; r10 = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13 = JDIMENSION output_row +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx +.column_ld4: + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq 
xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 
00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 
3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 
0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte 
SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jcgryext-sse2.asm b/Builder/jni-1.11/simd/jcgryext-sse2.asm new file mode 100644 index 000000000..cd16dd192 --- /dev/null +++ b/Builder/jni-1.11/simd/jcgryext-sse2.asm @@ -0,0 +1,384 @@ +; +; jcgryext.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] +.column_ld2: 
+ test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 
2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD 
[esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 
2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld 
xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jchuff-sse2-64.asm b/Builder/jni-1.11/simd/jchuff-sse2-64.asm new file mode 100644 index 000000000..b1144d1cd --- /dev/null +++ b/Builder/jni-1.11/simd/jchuff-sse2-64.asm @@ -0,0 +1,360 @@ +; +; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2) +; +; Copyright (C) 2009-2011, 2014-2016, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based directly on jchuff.c; see jchuff.c for more +; details. 
+; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_huff_encode_one_block) + +EXTN(jconst_huff_encode_one_block): + +%include "jpeg_nbits_table.inc" + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +; These macros perform the same task as the emit_bits() function in the +; original libjpeg code. In addition to reducing overhead by explicitly +; inlining the code, additional performance is achieved by taking into +; account the size of the bit buffer and waiting until it is almost full +; before emptying it. This mostly benefits 64-bit platforms, since 6 +; bytes can be stored in a 64-bit bit buffer before it has to be emptied. + +%macro EMIT_BYTE 0 + sub put_bits, 8 ; put_bits -= 8; + mov rdx, put_buffer + mov ecx, put_bits + shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [buffer], dl ; *buffer++ = c; + add buffer, 1 + cmp dl, 0xFF ; need to stuff a zero byte? 
+ jne %%.EMIT_BYTE_END + mov byte [buffer], 0 ; *buffer++ = 0; + add buffer, 1 +%%.EMIT_BYTE_END: +%endmacro + +%macro PUT_BITS 1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 +%endmacro + +%macro CHECKBUF31 0 + cmp put_bits, 32 ; if (put_bits > 31) { + jl %%.CHECKBUF31_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE +%%.CHECKBUF31_END: +%endmacro + +%macro CHECKBUF47 0 + cmp put_bits, 48 ; if (put_bits > 47) { + jl %%.CHECKBUF47_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE +%%.CHECKBUF47_END: +%endmacro + +%macro EMIT_BITS 2 + CHECKBUF47 + mov ecx, %2 + PUT_BITS %1 +%endmacro + +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); + pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); + pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); + pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; 
xmm_shadow[3] = block[jno3]; + pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; + pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%if %1 != 32 + pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; +%else + pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; +%endif + pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, 
neg); + paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%endmacro + +; +; Encode a single block's worth of coefficients. 
+; +; GLOBAL(JOCTET*) +; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; + +; r10 = working_state *state +; r11 = JOCTET *buffer +; r12 = JCOEFPTR block +; r13 = int last_dc_val +; r14 = c_derived_tbl *dctbl +; r15 = c_derived_tbl *actbl + +%define t1 rbp-(DCTSIZE2*SIZEOF_WORD) +%define t2 t1-(DCTSIZE2*SIZEOF_WORD) +%define put_buffer r8 +%define put_bits r9d +%define buffer rax + + align 16 + global EXTN(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [t2] + collect_args +%ifdef WIN64 + movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 + movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 + movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10 + movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11 + sub rsp, 4*SIZEOF_XMMWORD +%endif + push rbx + + mov buffer, r11 ; r11 is now scratch + + mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; + mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits; + push r10 ; r10 is now scratch + + ; Encode the DC coefficient difference per section F.1.2.1 + movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; + sub edi, r13d ; r13 is not used anymore + mov ebx, edi + + ; This is a well-known technique for obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog.
+ mov esi, edi + sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor edi, esi ; temp ^= temp3; + sub edi, esi ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add ebx, esi ; temp2 += temp3; + + ; Find the number of bits needed for the magnitude of the coefficient + lea r11, [rel jpeg_nbits_table] + movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); + ; Emit the Huffman-coded symbol for the number of bits + mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; + movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS r11, esi ; EMIT_BITS(code, size) + + ; Mask off any extra bits in code + mov esi, 1 + mov ecx, edi + shl esi, cl + dec esi + and ebx, esi ; temp2 &= (((JLONG) 1)<ehufco[0xf0]; + movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + lea rsi, [t1] +.BLOOP: + bsf r12, r11 ; r = __builtin_ctzl(index); + jz .ELOOP + mov rcx, r12 + lea rsi, [rsi+r12*2] ; k += r; + shr r11, cl ; index >>= r; + movzx rdi, word [rsi] ; temp = t1[k]; + lea rbx, [rel jpeg_nbits_table] + movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); +.BRLOOP: + cmp r12, 16 ; while (r > 15) { + jl .ERLOOP + EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) + sub r12, 16 ; r -= 16; + jmp .BRLOOP +.ERLOOP: + ; Emit Huffman symbol for run length / number of bits + CHECKBUF31 ; uses rcx, rdx + + shl r12, 4 ; temp3 = (r << 4) + nbits; + add r12, rdi + mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; + PUT_BITS rbx + + ;EMIT_CODE(code, size) + + movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov rcx, rdi + mov rdx, 1 + shl rdx, cl + dec rdx + and rbx, rdx ; temp2 &= (((JLONG) 1)<>= 1; + add rsi, 2 ; ++k; + jmp .BLOOP +.ELOOP: + ; If the last coef(s) were zero, emit an end-of-block code + lea rdi, [t1 + 
(DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp rdi, rsi ; if (r > 0) { + je .EFN + mov ebx, INT [r15] ; code = actbl->ehufco[0]; + movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS rbx, r12d +.EFN: + pop r10 + ; Save put_buffer & put_bits + mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; + mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits; + + pop rbx +%ifdef WIN64 + movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD] + movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD] + movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD] + movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD] + add rsp, 4*SIZEOF_XMMWORD +%endif + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jchuff-sse2.asm b/Builder/jni-1.11/simd/jchuff-sse2.asm new file mode 100644 index 000000000..b81db75b4 --- /dev/null +++ b/Builder/jni-1.11/simd/jchuff-sse2.asm @@ -0,0 +1,426 @@ +; +; jchuff-sse2.asm - Huffman entropy encoding (SSE2) +; +; Copyright (C) 2009-2011, 2014-2017, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based directly on jchuff.c; see jchuff.c for more +; details. 
+; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_huff_encode_one_block) + +EXTN(jconst_huff_encode_one_block): + +%include "jpeg_nbits_table.inc" + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +; These macros perform the same task as the emit_bits() function in the +; original libjpeg code. In addition to reducing overhead by explicitly +; inlining the code, additional performance is achieved by taking into +; account the size of the bit buffer and waiting until it is almost full +; before emptying it. This mostly benefits 64-bit platforms, since 6 +; bytes can be stored in a 64-bit bit buffer before it has to be emptied. + +%macro EMIT_BYTE 0 + sub put_bits, 8 ; put_bits -= 8; + mov edx, put_buffer + mov ecx, put_bits + shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [eax], dl ; *buffer++ = c; + add eax, 1 + cmp dl, 0xFF ; need to stuff a zero byte? 
+ jne %%.EMIT_BYTE_END + mov byte [eax], 0 ; *buffer++ = 0; + add eax, 1 +%%.EMIT_BYTE_END: +%endmacro + +%macro PUT_BITS 1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 +%endmacro + +%macro CHECKBUF15 0 + cmp put_bits, 16 ; if (put_bits > 15) { + jl %%.CHECKBUF15_END + mov eax, POINTER [esp+buffer] + EMIT_BYTE + EMIT_BYTE + mov POINTER [esp+buffer], eax +%%.CHECKBUF15_END: +%endmacro + +%macro EMIT_BITS 1 + PUT_BITS %1 + CHECKBUF15 +%endmacro + +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128(); + pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); + pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); + pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] =
block[jno19]; + pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; + pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%if %1 != 32 + pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; +%else + pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; +%endif + pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm6 ; x1 = 
_mm_xor_si128(x1, neg); + pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%endmacro + +; +; Encode a single block's worth of coefficients. 
+; +; GLOBAL(JOCTET*) +; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; + +; eax + 8 = working_state *state +; eax + 12 = JOCTET *buffer +; eax + 16 = JCOEFPTR block +; eax + 20 = int last_dc_val +; eax + 24 = c_derived_tbl *dctbl +; eax + 28 = c_derived_tbl *actbl + +%define pad 6*SIZEOF_DWORD ; Align to 16 bytes +%define t1 pad +%define t2 t1+(DCTSIZE2*SIZEOF_WORD) +%define block t2+(DCTSIZE2*SIZEOF_WORD) +%define actbl block+SIZEOF_DWORD +%define buffer actbl+SIZEOF_DWORD +%define temp buffer+SIZEOF_DWORD +%define temp2 temp+SIZEOF_DWORD +%define temp3 temp2+SIZEOF_DWORD +%define temp4 temp3+SIZEOF_DWORD +%define temp5 temp4+SIZEOF_DWORD +%define gotptr temp5+SIZEOF_DWORD ; void *gotptr +%define put_buffer ebx +%define put_bits edi + + align 16 + global EXTN(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + sub esp, temp5+9*SIZEOF_DWORD-pad + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + mov esi, POINTER [eax+8] ; (working_state *state) + mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer; + mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits; + push esi ; esi is now scratch + + get_GOT edx ; get GOT address + movpic POINTER [esp+gotptr], edx ; save GOT address + + mov ecx, POINTER [eax+28] + mov edx, POINTER [eax+16] + mov esi, POINTER [eax+12] + mov POINTER [esp+actbl], ecx + mov POINTER [esp+block], edx + mov POINTER [esp+buffer], esi + + ; Encode the DC coefficient difference per section F.1.2.1 + mov esi, POINTER [esp+block] ; block + movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; + sub ecx, DWORD [eax+20] + mov esi, ecx + + ; This is a well-known technique for 
obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. + mov edx, ecx + sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor ecx, edx ; temp ^= temp3; + sub ecx, edx ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add esi, edx ; temp2 += temp3; + mov DWORD [esp+temp], esi ; backup temp2 in temp + + ; Find the number of bits needed for the magnitude of the coefficient + movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) + movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp); + mov DWORD [esp+temp2], edx ; backup nbits in temp2 + + ; Emit the Huffman-coded symbol for the number of bits + mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore + mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits]; + movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS eax ; EMIT_BITS(code, size) + + mov ecx, DWORD [esp+temp2] ; restore nbits + + ; Mask off any extra bits in code + mov eax, 1 + shl eax, cl + dec eax + and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<>= r; + mov DWORD [esp+temp3], edx +.BRLOOP: + cmp ecx, 16 ; while (r > 15) { + jl near .ERLOOP + sub ecx, 16 ; r -= 16; + mov DWORD [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, DWORD [esp+temp] + jmp .BRLOOP +.ERLOOP: + movsx eax, word [esi] ; temp = t1[k]; + movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) + movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp); + mov DWORD [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; 
temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, DWORD [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG) 1)<>= 1; + + jmp .BLOOP +.ELOOP: + movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); + movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); + movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); + movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); + pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + shl ecx, 16 + or edx, ecx + not edx ; index = ~index; + + lea eax, [esp + t1 + (DCTSIZE2/2) * 2] + sub eax, esi + shr eax, 1 + bsf ecx, edx ; r = __builtin_ctzl(index); + jz near .ELOOP2 + shr edx, cl ; index >>= r; + add ecx, eax + lea esi, [esi+ecx*2] ; k += r; + mov DWORD [esp+temp3], edx + jmp .BRLOOP2 +.BLOOP2: + bsf ecx, edx ; r = __builtin_ctzl(index); + jz near .ELOOP2 + lea esi, [esi+ecx*2] ; k += r; + shr edx, cl ; index >>= r; + mov DWORD [esp+temp3], edx +.BRLOOP2: + cmp ecx, 16 ; while (r > 15) { + jl near .ERLOOP2 + sub ecx, 16 ; r -= 16; + mov DWORD [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = 
actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, DWORD [esp+temp] + jmp .BRLOOP2 +.ERLOOP2: + movsx eax, word [esi] ; temp = t1[k]; + bsr eax, eax ; nbits = 32 - __builtin_clz(temp); + inc eax + mov DWORD [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, DWORD [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG) 1)<>= 1; + + jmp .BLOOP2 +.ELOOP2: + ; If the last coef(s) were zero, emit an end-of-block code + lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp edx, esi ; if (r > 0) { + je .EFN + mov eax, INT [ebp] ; code = actbl->ehufco[0]; + movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS eax +.EFN: + mov eax, [esp+buffer] + pop esi + ; Save put_buffer & put_bits + mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; + mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits; + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jcolsamp.inc b/Builder/jni-1.11/simd/jcolsamp.inc similarity index 97% rename from Builder/jni-1.11/simd/i386/src/jcolsamp.inc rename to Builder/jni-1.11/simd/jcolsamp.inc index 79751b7c7..3be446e84 100644 --- a/Builder/jni-1.11/simd/i386/src/jcolsamp.inc +++ b/Builder/jni-1.11/simd/jcolsamp.inc @@ -3,8 +3,7 @@ ; ; Copyright 2009 Pierre Ossman for Cendio AB ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; diff --git a/Builder/jni-1.11/simd/jcsample-altivec.c b/Builder/jni-1.11/simd/jcsample-altivec.c new file mode 100644 index 000000000..11609d9da --- /dev/null +++ b/Builder/jni-1.11/simd/jcsample-altivec.c @@ -0,0 +1,158 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* CHROMA DOWNSAMPLING */ + +#include "jsimd_altivec.h" +#include "jcsample.h" + + +void +jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int outrow, outcol; + JDIMENSION output_cols = width_blocks * DCTSIZE; + JSAMPROW inptr, outptr; + + __vector unsigned char this0, next0, out; + __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; + + /* Constants */ + __vector unsigned short pw_bias = { __4X2(0, 1) }, + pw_one = { __8X(1) }; + __vector unsigned char even_odd_index = + {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, + pb_zero = { __16X(0) }; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + for (outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 16, inptr += 32, outptr += 16) { + + this0 = vec_ld(0, inptr); + this0 = vec_perm(this0, this0, even_odd_index); + this0e = (__vector unsigned short)VEC_UNPACKHU(this0); + this0o = (__vector unsigned short)VEC_UNPACKLU(this0); + outl = vec_add(this0e, this0o); + outl = vec_add(outl, pw_bias); + outl = vec_sr(outl, pw_one); + + if (outcol > 8) { + next0 = vec_ld(16, inptr); + next0 = vec_perm(next0, next0, even_odd_index); + next0e = (__vector unsigned short)VEC_UNPACKHU(next0); + next0o = (__vector unsigned short)VEC_UNPACKLU(next0); + outh = vec_add(next0e, next0o); + outh = vec_add(outh, pw_bias); + outh = vec_sr(outh, pw_one); + } else + outh = vec_splat_u16(0); + + out = vec_pack(outl, outh); + vec_st(out, 0, outptr); + } + } +} + + +void +jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow, outcol; + JDIMENSION output_cols = width_blocks * DCTSIZE; + JSAMPROW 
inptr0, inptr1, outptr; + + __vector unsigned char this0, next0, this1, next1, out; + __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o, + next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; + + /* Constants */ + __vector unsigned short pw_bias = { __4X2(1, 2) }, + pw_two = { __8X(2) }; + __vector unsigned char even_odd_index = + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + pb_zero = { __16X(0) }; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + for (inrow = 0, outrow = 0; outrow < v_samp_factor; + inrow += 2, outrow++) { + + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr = output_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { + + this0 = vec_ld(0, inptr0); + this0 = vec_perm(this0, this0, even_odd_index); + this0e = (__vector unsigned short)VEC_UNPACKHU(this0); + this0o = (__vector unsigned short)VEC_UNPACKLU(this0); + out0l = vec_add(this0e, this0o); + + this1 = vec_ld(0, inptr1); + this1 = vec_perm(this1, this1, even_odd_index); + this1e = (__vector unsigned short)VEC_UNPACKHU(this1); + this1o = (__vector unsigned short)VEC_UNPACKLU(this1); + out1l = vec_add(this1e, this1o); + + outl = vec_add(out0l, out1l); + outl = vec_add(outl, pw_bias); + outl = vec_sr(outl, pw_two); + + if (outcol > 8) { + next0 = vec_ld(16, inptr0); + next0 = vec_perm(next0, next0, even_odd_index); + next0e = (__vector unsigned short)VEC_UNPACKHU(next0); + next0o = (__vector unsigned short)VEC_UNPACKLU(next0); + out0h = vec_add(next0e, next0o); + + next1 = vec_ld(16, inptr1); + next1 = vec_perm(next1, next1, even_odd_index); + next1e = (__vector unsigned short)VEC_UNPACKHU(next1); + next1o = (__vector unsigned short)VEC_UNPACKLU(next1); + out1h = vec_add(next1e, next1o); + + outh = vec_add(out0h, out1h); + outh = vec_add(outh, pw_bias); + outh = vec_sr(outh, pw_two); + } else + outh = vec_splat_u16(0); + + out = 
vec_pack(outl, outh); + vec_st(out, 0, outptr); + } + } +} diff --git a/Builder/jni-1.11/simd/jcsample-mmx.asm b/Builder/jni-1.11/simd/jcsample-mmx.asm new file mode 100644 index 000000000..6cd544e74 --- /dev/null +++ b/Builder/jni-1.11/simd/jcsample-mmx.asm @@ -0,0 +1,323 @@ +; +; jcsample.asm - downsampling (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v1_downsample_mmx) + +EXTN(jsimd_h2v1_downsample_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd mm7,edx + pcmpeqw mm6,mm6 + punpckldq mm7,mm7 ; mm7={0, 1, 0, 1} + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + 
movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2,mm0 + movq mm3,mm1 + + pand mm0,mm6 + psrlw mm2,BYTE_BIT + pand mm1,mm6 + psrlw mm3,BYTE_BIT + + paddw mm0,mm2 + paddw mm1,mm3 + paddw mm0,mm7 + paddw mm1,mm7 + psrlw mm0,1 + psrlw mm1,1 + + packuswb mm0,mm1 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz short .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v2_downsample_mmx) + +EXTN(jsimd_h2v2_downsample_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd mm7,edx + pcmpeqw mm6,mm6 + punpckldq mm7,mm7 ; mm7={1, 2, 1, 2} + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + 
alignx 16,7 +.columnloop: + + movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] + movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm4,mm0 + movq mm5,mm1 + pand mm0,mm6 + psrlw mm4,BYTE_BIT + pand mm1,mm6 + psrlw mm5,BYTE_BIT + paddw mm0,mm4 + paddw mm1,mm5 + + movq mm4,mm2 + movq mm5,mm3 + pand mm2,mm6 + psrlw mm4,BYTE_BIT + pand mm3,mm6 + psrlw mm5,BYTE_BIT + paddw mm2,mm4 + paddw mm3,mm5 + + paddw mm0,mm1 + paddw mm2,mm3 + paddw mm0,mm7 + paddw mm2,mm7 + psrlw mm0,2 + psrlw mm2,2 + + packuswb mm0,mm2 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add edx, byte 2*SIZEOF_MMWORD ; inptr0 + add esi, byte 2*SIZEOF_MMWORD ; inptr1 + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz near .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jcsample-sse2-64.asm b/Builder/jni-1.11/simd/jcsample-sse2-64.asm new file mode 100644 index 000000000..40ee15fcb --- /dev/null +++ b/Builder/jni-1.11/simd/jcsample-sse2-64.asm @@ -0,0 +1,329 @@ +; +; jcsample.asm - downsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +; r10 = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12 = JDIMENSION v_samp_factor +; r13 = JDIMENSION width_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov ecx, r13d + shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx,1 ; output_cols * 2 + sub rcx,rdx + jle short .expand_end + + mov rax, r11 + test rax,rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi,rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v1_downsample + + mov eax, r12d ; rowctr + test eax,eax + jle near .return + + mov rdx, 0x00010000 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + 
jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + test rcx,rcx + jnz short .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +; r10 = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12 = JDIMENSION v_samp_factor +; r13 = JDIMENSION width_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov ecx, r13d + shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx,1 ; output_cols * 2 + sub rcx,rdx + jle short .expand_end + + mov rax, r11 + test rax,rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi,rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v2_downsample + + mov eax, r12d ; rowctr + test rax,rax + jle near .return + + mov rdx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD 
[rdx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jcsample-sse2.asm b/Builder/jni-1.11/simd/jcsample-sse2.asm new file mode 100644 index 000000000..83c9d152a --- /dev/null +++ b/Builder/jni-1.11/simd/jcsample-sse2.asm @@ -0,0 +1,350 @@ +; +; jcsample.asm - downsampling (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd 
xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 + +.columnloop: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx,ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, 
JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 + +.columnloop: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jcsample.h b/Builder/jni-1.11/simd/jcsample.h new file mode 100644 index 000000000..2a50544e9 --- /dev/null +++ b/Builder/jni-1.11/simd/jcsample.h @@ -0,0 +1,28 @@ +/* + * jcsample.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ +/* Widen the first num_rows rows of image_data from input_cols to output_cols samples by copying each row's last existing sample (ptr[-1]) into the added columns. Does nothing when the computed numcols is not positive. Reads ptr[-1], so it assumes input_cols >= 1. */ +LOCAL(void) +expand_right_edge (JSAMPARRAY image_data, int num_rows, + JDIMENSION input_cols, JDIMENSION output_cols) +{ + register JSAMPROW ptr; + register JSAMPLE pixval; + register int count; + int row; + int numcols = (int) (output_cols - input_cols); /* number of samples to append to each row */ + + if (numcols > 0) { + for (row = 0; row < num_rows; row++) { + ptr = image_data[row] + input_cols; /* first column past the existing data */ + pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ + for (count = numcols; count > 0; count--) + *ptr++ = pixval; + } + } +} diff --git a/Builder/jni-1.11/simd/jdcolext-altivec.c b/Builder/jni-1.11/simd/jdcolext-altivec.c new file mode 100644 index 000000000..fb121ce74 --- /dev/null +++ b/Builder/jni-1.11/simd/jdcolext-altivec.c @@ -0,0 +1,274 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-altivec.c */ + +/* Converts num_rows rows, starting at input_row, from the three planes input_buf[0..2] (loaded below as y, cb, cr) into interleaved pixels of RGB_PIXELSIZE bytes written through output_buf, handling 16 pixels per inner-loop iteration. The slow paths stage a chunk in tmpbuf and memcpy() only the bytes that still belong to the row. */ +void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = out_width * RGB_PIXELSIZE, num_cols; /* both count output bytes per row, not pixels */ +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh, + crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; + __vector int g0, g1, g2, g3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 + * high-order bits, not 16. 
+ */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; +#endif + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16, + inptr0 += 16, inptr1 += 16, inptr2 += 16) { + + y = vec_ld(0, inptr0); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + yl = (__vector signed short)VEC_UNPACKHU(y); + yh = (__vector signed short)VEC_UNPACKLU(y); + + cb = vec_ld(0, inptr1); + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + bl = vec_add(cbl, cbl); + bh = vec_add(cbh, cbh); + bl = vec_madds(bl, pw_mf0228, pw_one); + bh = vec_madds(bh, pw_mf0228, pw_one); + bl = vec_sra(bl, (__vector unsigned short)pw_one); + bh = vec_sra(bh, (__vector unsigned short)pw_one); + bl = vec_add(bl, cbl); + bh = vec_add(bh, cbh); + bl = vec_add(bl, cbl); + bh = 
vec_add(bh, cbh); + bl = vec_add(bl, yl); + bh = vec_add(bh, yh); + + rl = vec_add(crl, crl); + rh = vec_add(crh, crh); + rl = vec_madds(rl, pw_f0402, pw_one); + rh = vec_madds(rh, pw_f0402, pw_one); + rl = vec_sra(rl, (__vector unsigned short)pw_one); + rh = vec_sra(rh, (__vector unsigned short)pw_one); + rl = vec_add(rl, crl); + rh = vec_add(rh, crh); + rl = vec_add(rl, yl); + rh = vec_add(rh, yh); + + g0w = vec_mergeh(cbl, crl); + g1w = vec_mergel(cbl, crl); + g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf); + g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf); + g2w = vec_mergeh(cbh, crh); + g3w = vec_mergel(cbh, crh); + g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf); + g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index); + gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index); + gl = vec_sub(gl, crl); + gh = vec_sub(gh, crh); + gl = vec_add(gl, yl); + gh = vec_add(gh, yh); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb 
Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. 
+ */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} diff --git a/Builder/jni-1.11/simd/jdcolext-mmx.asm b/Builder/jni-1.11/simd/jdcolext-mmx.asm new file mode 100644 index 000000000..21e34f678 --- /dev/null +++ b/Builder/jni-1.11/simd/jdcolext-mmx.asm @@ -0,0 +1,404 @@ +; +; jdcolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman 
for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width, +; JSAMPIMAGE input_buf, JDIMENSION input_row, +; JSAMPARRAY output_buf, int num_rows) +; Each .columnloop pass converts SIZEOF_MMWORD pixels; a final partial group is stored piecewise by the .column_st* tails. + +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_ycc_rgb_convert_mmx) + +EXTN(jsimd_ycc_rgb_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make room for the GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] 
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 +.columnloop: + + movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) + movq mm1, MMWORD [edx] ; mm1=Cr(01234567) + + pcmpeqw mm4,mm4 + pcmpeqw mm7,mm7 + psrlw mm4,BYTE_BIT + psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} + + pand mm4,mm5 ; mm4=Cb(0246)=CbE + psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO + pand mm0,mm1 ; mm0=Cr(0246)=CrE + psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO + + paddw mm4,mm7 + paddw mm5,mm7 + paddw mm0,mm7 + paddw mm1,mm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm2,mm4 ; mm2=CbE + movq mm3,mm5 ; mm3=CbO + paddw mm4,mm4 ; mm4=2*CbE + paddw mm5,mm5 ; mm5=2*CbO + movq mm6,mm0 ; mm6=CrE + movq mm7,mm1 ; mm7=CrO + paddw mm0,mm0 ; mm0=2*CrE + paddw mm1,mm1 ; mm1=2*CrO + + pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) + pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) + pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) + pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) + + paddw mm4,[GOTOFF(eax,PW_ONE)] + paddw mm5,[GOTOFF(eax,PW_ONE)] + psraw mm4,1 ; mm4=(CbE * -FIX(0.22800)) + psraw mm5,1 ; mm5=(CbO * -FIX(0.22800)) + 
paddw mm0,[GOTOFF(eax,PW_ONE)] + paddw mm1,[GOTOFF(eax,PW_ONE)] + psraw mm0,1 ; mm0=(CrE * FIX(0.40200)) + psraw mm1,1 ; mm1=(CrO * FIX(0.40200)) + + paddw mm4,mm2 + paddw mm5,mm3 + paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E + paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O + paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E + paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O + + movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E + movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O + + movq mm4,mm2 + movq mm5,mm3 + punpcklwd mm2,mm6 + punpckhwd mm4,mm6 + pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm3,mm7 + punpckhwd mm5,mm7 + pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm2,[GOTOFF(eax,PD_ONEHALF)] + paddd mm4,[GOTOFF(eax,PD_ONEHALF)] + psrad mm2,SCALEBITS + psrad mm4,SCALEBITS + paddd mm3,[GOTOFF(eax,PD_ONEHALF)] + paddd mm5,[GOTOFF(eax,PD_ONEHALF)] + psrad mm3,SCALEBITS + psrad mm5,SCALEBITS + + packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movq mm5, MMWORD [esi] ; mm5=Y(01234567) + + pcmpeqw mm4,mm4 + psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} + pand mm4,mm5 ; mm4=Y(0246)=YE + psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO + + paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) + paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) + packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) + paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) + packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) + paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) + packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** 
** **) + packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) + + movq mmG,mmA + movq mmH,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) + + psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + + movq mmC,mmD + movq mmB,mmD + punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) + + psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + + movq mmF,mmE + punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) + + punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16,7 + +.column_st16: ; reached when fewer than SIZEOF_MMWORD pixels remain in the row + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA,mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 
+.column_st8: + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD +.column_st4: + movd eax,mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov DWORD [edi+0*SIZEOF_DWORD], eax + psrlq mmA,DWORD_BIT + movd eax,mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD +.column_st2: + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi+0*SIZEOF_WORD], ax + shr eax,WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD +.column_st1: + cmp ecx, byte SIZEOF_BYTE + jb short .nextrow + mov BYTE [edi+0*SIZEOF_BYTE], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%else + pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%endif + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG,mmB + punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD,mmA + punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH,mmC + punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + 
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16,7 + +.column_st16: + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA,mmC + movq mmD,mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD +.column_st8: + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD +.column_st4: + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .nextrow + movd DWORD [edi+0*SIZEOF_DWORD], mmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16,7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jdcolext-sse2-64.asm b/Builder/jni-1.11/simd/jdcolext-sse2-64.asm new file mode 100644 index 000000000..4634066c4 --- /dev/null +++ b/Builder/jni-1.11/simd/jdcolext-sse2-64.asm @@ -0,0 +1,440 @@ +; +; jdcolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2012, D. R. Commander. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width, +; JSAMPIMAGE input_buf, JDIMENSION input_row, +; JSAMPARRAY output_buf, int num_rows) +; + +; r10 = JDIMENSION out_width +; r11 = JSAMPIMAGE input_buf +; r12 = JDIMENSION input_row +; r13 = JSAMPARRAY output_buf +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d ; num_cols + test rcx,rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr0 + mov rbx, JSAMPROW [rbx] ; inptr1 + mov rdx, JSAMPROW [rdx] ; 
inptr2 + mov rdi, JSAMPROW [rdi] ; outptr +.columnloop: + + movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[rel PW_ONE] + paddw xmm5,[rel PW_ONE] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[rel PW_ONE] + paddw xmm1,[rel PW_ONE] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + 
punpcklwd xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[rel PW_MF0344_F0285] + pmaddwd xmm4,[rel PW_MF0344_F0285] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[rel PW_MF0344_F0285] + pmaddwd xmm5,[rel PW_MF0344_F0285] + + paddd xmm2,[rel PD_ONEHALF] + paddd xmm4,[rel PD_ONEHALF] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + paddd xmm3,[rel PD_ONEHALF] + paddd xmm5,[rel PD_ONEHALF] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 
22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD 
[rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .nextrow + mov BYTE [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short 
.out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.nextrow: + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jdcolext-sse2.asm b/Builder/jni-1.11/simd/jdcolext-sse2.asm new file mode 100644 index 000000000..682aef35f --- /dev/null +++ b/Builder/jni-1.11/simd/jdcolext-sse2.asm @@ -0,0 +1,459 @@ +; +; jdcolext.asm - colorspace conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2012, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width, +; JSAMPIMAGE input_buf, JDIMENSION input_row, +; JSAMPARRAY output_buf, int num_rows) +; + +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 +.columnloop: + + movdqa xmm5, XMMWORD 
[ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[GOTOFF(eax,PW_ONE)] + paddw xmm5,[GOTOFF(eax,PW_ONE)] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[GOTOFF(eax,PW_ONE)] + paddw xmm1,[GOTOFF(eax,PW_ONE)] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + punpcklwd 
xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 
1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; 
--(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. 
+ movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov BYTE [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + 
jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .nextrow + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16,7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jdcolor-altivec.c b/Builder/jni-1.11/simd/jdcolor-altivec.c new file mode 100644 index 000000000..0dc4c427c --- /dev/null +++ b/Builder/jni-1.11/simd/jdcolor-altivec.c @@ -0,0 +1,96 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* YCC --> RGB CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18} +#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21} +#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30} +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26} +#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21} +#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX 
+#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec diff --git a/Builder/jni-1.11/simd/i386/src/jdcolmmx.asm b/Builder/jni-1.11/simd/jdcolor-mmx.asm similarity index 68% rename from Builder/jni-1.11/simd/i386/src/jdcolmmx.asm rename to Builder/jni-1.11/simd/jdcolor-mmx.asm index 5e4e47d42..4e58031dd 100644 --- a/Builder/jni-1.11/simd/i386/src/jdcolmmx.asm +++ b/Builder/jni-1.11/simd/jdcolor-mmx.asm @@ -1,11 +1,10 @@ ; -; jdcolmmx.asm - colorspace conversion (MMX) +; jdcolor.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2009 D. R. Commander +; Copyright (C) 2009, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -21,37 +20,37 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_mmx) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_mmx) EXTN(jconst_ycc_rgb_convert_mmx): -PW_F0402 times 4 dw F_0_402 -PW_MF0228 times 4 dw -F_0_228 -PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 -PW_ONE times 4 dw 1 -PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -62,7 +61,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_RGB_BLUE %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -73,7 +72,7 @@ PD_ONEHALF times 2 dd 1 << 
(SCALEBITS-1) %define RGB_BLUE EXT_RGBX_BLUE %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -84,7 +83,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_BGR_BLUE %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -95,7 +94,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_BGRX_BLUE %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -106,7 +105,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_XBGR_BLUE %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -117,4 +116,4 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_XRGB_BLUE %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx -%include "jdclrmmx.asm" +%include "jdcolext-mmx.asm" diff --git a/Builder/jni-1.11/simd/jdcolor-sse2-64.asm b/Builder/jni-1.11/simd/jdcolor-sse2-64.asm new file mode 100644 index 000000000..d2bf21000 --- /dev/null +++ b/Builder/jni-1.11/simd/jdcolor-sse2-64.asm @@ -0,0 +1,119 @@ +; +; jdcolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) + +EXTN(jconst_ycc_rgb_convert_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED 
+%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 +%include "jdcolext-sse2-64.asm" diff --git a/Builder/jni-1.11/simd/i386/src/jdcolss2.asm b/Builder/jni-1.11/simd/jdcolor-sse2.asm similarity index 68% rename from Builder/jni-1.11/simd/i386/src/jdcolss2.asm rename to Builder/jni-1.11/simd/jdcolor-sse2.asm index 1912d92e4..7ff5d05d0 100644 --- a/Builder/jni-1.11/simd/i386/src/jdcolss2.asm +++ b/Builder/jni-1.11/simd/jdcolor-sse2.asm @@ -1,11 +1,10 @@ ; -; jdcolss2.asm - colorspace conversion (SSE2) +; jdcolor.asm - colorspace conversion (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2009 D. R. Commander +; Copyright (C) 2009, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -21,37 +20,37 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_sse2) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): -PW_F0402 times 8 dw F_0_402 -PW_MF0228 times 8 dw -F_0_228 -PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 -PW_ONE times 8 dw 1 -PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -62,7 +61,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_RGB_BLUE %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -73,7 +72,7 @@ PD_ONEHALF times 4 dd 1 
<< (SCALEBITS-1) %define RGB_BLUE EXT_RGBX_BLUE %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -84,7 +83,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_BGR_BLUE %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -95,7 +94,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_BGRX_BLUE %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -106,7 +105,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_XBGR_BLUE %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -117,4 +116,4 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_BLUE EXT_XRGB_BLUE %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 -%include "jdclrss2.asm" +%include "jdcolext-sse2.asm" diff --git a/Builder/jni-1.11/simd/i386/src/jdct.inc b/Builder/jni-1.11/simd/jdct.inc similarity index 67% rename from Builder/jni-1.11/simd/i386/src/jdct.inc rename to Builder/jni-1.11/simd/jdct.inc index cc6270425..b9761071e 100644 --- a/Builder/jni-1.11/simd/i386/src/jdct.inc +++ b/Builder/jni-1.11/simd/jdct.inc @@ -3,8 +3,7 @@ ; ; Copyright 2009 Pierre Ossman for Cendio AB ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -18,11 +17,11 @@ ; %define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples -%define ROW(n,b,s) ((b)+(n)*(s)) -%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) +%define ROW(n,b,s) ((b)+(n)*(s)) +%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) -%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) -%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) -%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) +%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) +%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) +%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) ; -------------------------------------------------------------------------- diff --git a/Builder/jni-1.11/simd/jdmerge-altivec.c b/Builder/jni-1.11/simd/jdmerge-altivec.c new file mode 100644 index 000000000..6a35f2019 --- /dev/null +++ b/Builder/jni-1.11/simd/jdmerge-altivec.c @@ -0,0 +1,108 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. 
This notice may not be removed or altered from any source distribution. + */ + +/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18} +#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21} +#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30} +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26} +#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21} +#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22} +#define jsimd_h2v1_merged_upsample_altivec 
jsimd_h2v1_extbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec diff --git a/Builder/jni-1.11/simd/i386/src/jdmermmx.asm b/Builder/jni-1.11/simd/jdmerge-mmx.asm similarity index 72% rename from Builder/jni-1.11/simd/i386/src/jdmermmx.asm rename to Builder/jni-1.11/simd/jdmerge-mmx.asm index 7b86c7493..ee58bff1c 100644 --- a/Builder/jni-1.11/simd/i386/src/jdmermmx.asm +++ b/Builder/jni-1.11/simd/jdmerge-mmx.asm @@ -1,11 +1,10 @@ ; -; 
jdmermmx.asm - merged upsampling/color conversion (MMX) +; jdmerge.asm - merged upsampling/color conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2009 D. R. Commander +; Copyright (C) 2009, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -21,37 +20,37 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_mmx) + alignz 16 + global EXTN(jconst_merged_upsample_mmx) EXTN(jconst_merged_upsample_mmx): -PW_F0402 times 4 dw F_0_402 -PW_MF0228 times 4 dw -F_0_228 -PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 -PW_ONE times 4 dw 1 -PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include 
"jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -63,7 +62,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx -%include "jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -75,7 +74,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx -%include "jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -87,7 +86,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx -%include "jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -99,7 +98,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx -%include "jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -111,7 +110,7 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx -%include "jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" %undef RGB_RED %undef RGB_GREEN @@ -123,4 +122,4 @@ PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx %define 
jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx -%include "jdmrgmmx.asm" +%include "jdmrgext-mmx.asm" diff --git a/Builder/jni-1.11/simd/jdmerge-sse2-64.asm b/Builder/jni-1.11/simd/jdmerge-sse2-64.asm new file mode 100644 index 000000000..244bd4023 --- /dev/null +++ b/Builder/jni-1.11/simd/jdmerge-sse2-64.asm @@ -0,0 +1,125 @@ +; +; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_merged_upsample_sse2) + +EXTN(jconst_merged_upsample_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef 
RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2 +%include 
"jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" diff --git a/Builder/jni-1.11/simd/i386/src/jdmerss2.asm b/Builder/jni-1.11/simd/jdmerge-sse2.asm similarity index 72% rename from Builder/jni-1.11/simd/i386/src/jdmerss2.asm rename to Builder/jni-1.11/simd/jdmerge-sse2.asm index e536c802e..236de5a38 100644 --- a/Builder/jni-1.11/simd/i386/src/jdmerss2.asm +++ b/Builder/jni-1.11/simd/jdmerge-sse2.asm @@ -1,11 +1,10 @@ ; -; jdmerss2.asm - merged upsampling/color conversion (SSE2) +; jdmerge.asm - merged upsampling/color conversion (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2009 D. R. Commander +; Copyright (C) 2009, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -21,37 +20,37 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_sse2) + alignz 16 + global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): -PW_F0402 times 8 dw F_0_402 -PW_MF0228 times 8 dw -F_0_228 -PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 -PW_ONE times 8 dw 1 -PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -63,7 +62,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" %undef 
RGB_RED %undef RGB_GREEN @@ -75,7 +74,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -87,7 +86,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -99,7 +98,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -111,7 +110,7 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" %undef RGB_RED %undef RGB_GREEN @@ -123,4 +122,4 @@ PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2 -%include "jdmrgss2.asm" +%include "jdmrgext-sse2.asm" diff --git a/Builder/jni-1.11/simd/jdmrgext-altivec.c b/Builder/jni-1.11/simd/jdmrgext-altivec.c new file mode 100644 index 000000000..55205bb1f --- /dev/null +++ b/Builder/jni-1.11/simd/jdmrgext-altivec.c @@ -0,0 +1,323 @@ +/* + * AltiVec optimizations for 
libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdmerge-altivec.c */ + + +void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh, + crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, + rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo; + __vector int g_y0, g_y1, g_y2, g_y3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that 
vec_madds() returns 17 + * high-order bits, not 16. + */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}, + even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30}, + odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}, + even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0}, + odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0}; +#endif + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) { + + cb = vec_ld(0, inptr1); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. 
+ */ + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + b_yl = vec_add(cbl, cbl); + b_yh = vec_add(cbh, cbh); + b_yl = vec_madds(b_yl, pw_mf0228, pw_one); + b_yh = vec_madds(b_yh, pw_mf0228, pw_one); + b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one); + b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + + r_yl = vec_add(crl, crl); + r_yh = vec_add(crh, crh); + r_yl = vec_madds(r_yl, pw_f0402, pw_one); + r_yh = vec_madds(r_yh, pw_f0402, pw_one); + r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one); + r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one); + r_yl = vec_add(r_yl, crl); + r_yh = vec_add(r_yh, crh); + + g_y0w = vec_mergeh(cbl, crl); + g_y1w = vec_mergel(cbl, crl); + g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf); + g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf); + g_y2w = vec_mergeh(cbh, crh); + g_y3w = vec_mergel(cbh, crh); + g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf); + g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. 
+ */ + g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1, + shift_pack_index); + g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3, + shift_pack_index); + g_yl = vec_sub(g_yl, crl); + g_yh = vec_sub(g_yh, crh); + + for (yloop = 0; yloop < 2 && num_cols > 0; yloop++, + num_cols -= RGB_PIXELSIZE * 16, + outptr += RGB_PIXELSIZE * 16, inptr0 += 16) { + + y = vec_ld(0, inptr0); + ye = (__vector signed short)vec_perm(pb_zero, y, even_index); + yo = (__vector signed short)vec_perm(pb_zero, y, odd_index); + + if (yloop == 0) { + be = vec_add(b_yl, ye); + bo = vec_add(b_yl, yo); + re = vec_add(r_yl, ye); + ro = vec_add(r_yl, yo); + ge = vec_add(g_yl, ye); + go = vec_add(g_yl, yo); + } else { + be = vec_add(b_yh, ye); + bo = vec_add(b_yh, yo); + re = vec_add(r_yh, ye); + ro = vec_add(r_yh, yo); + ge = vec_add(g_yh, ye); + go = vec_add(g_yh, yo); + } + + rl = vec_mergeh(re, ro); + rh = vec_mergel(re, ro); + gl = vec_mergeh(ge, go); + gh = vec_mergel(ge, go); + bl = vec_mergeh(be, bo); + bh = vec_mergel(be, bo); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + 
rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. 
+ */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} + + +void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW inptr, outptr; + + inptr = input_buf[0][in_row_group_ctr]; + outptr = output_buf[0]; + + input_buf[0][in_row_group_ctr] = 
input_buf[0][in_row_group_ctr * 2]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1]; + output_buf[0] = output_buf[1]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = inptr; + output_buf[0] = outptr; +} diff --git a/Builder/jni-1.11/simd/jdmrgext-mmx.asm b/Builder/jni-1.11/simd/jdmrgext-mmx.asm new file mode 100644 index 000000000..63f45cf37 --- /dev/null +++ b/Builder/jni-1.11/simd/jdmrgext-mmx.asm @@ -0,0 +1,463 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+        align 16
+        global EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+        push ebp ; save caller's frame pointer
+        mov eax,esp ; eax = original ebp
+        sub esp, byte 4 ; leave room for the saved stack pointer stored two lines below
+        and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+        mov [esp],eax ; stash the unaligned stack pointer; .return reloads it with "pop esp"
+        mov ebp,esp ; ebp = aligned ebp
+        lea esp, [wk(0)] ; reserve the WK_NUM-entry wk[] scratch area below ebp
+        pushpic eax ; make room for the GOT address
+        push ebx ; ebx/esi/edi are saved here and restored at .return
+;       push ecx ; need not be preserved
+;       push edx ; need not be preserved
+        push esi
+        push edi
+
+        get_GOT ebx ; get GOT address
+        movpic POINTER [gotptr], ebx ; save GOT address
+
+        mov ecx, JDIMENSION [output_width(eax)] ; col
+        test ecx,ecx
+        jz near .return ; output_width == 0: nothing to convert
+
+        push ecx ; ecx is reused for in_row_group_ctr below; col is popped back afterwards
+
+        mov edi, JSAMPIMAGE [input_buf(eax)]
+        mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+        mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+        mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+        mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+        mov edi, JSAMPARRAY [output_buf(eax)]
+        mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+        mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+        mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+        mov edi, JSAMPROW [edi] ; outptr
+
+        pop ecx ; col
+
+        alignx 16,7
+.columnloop:
+        movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+        movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
+        movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
+
+        pxor mm1,mm1 ; mm1=(all 0's)
+        pcmpeqw mm3,mm3
+        psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+        movq mm4,mm6
+        punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH
+        punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL
+        movq mm0,mm7
+        punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH
+        punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL
+
+        paddw mm6,mm3 ; adding 0xFF80 to each word subtracts 128 from the chroma sample
+        paddw mm4,mm3
+        paddw mm7,mm3
+        paddw mm0,mm3
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movq mm5,mm6 ; mm5=CbH
+        movq mm2,mm4 ; mm2=CbL
+        paddw mm6,mm6 ; mm6=2*CbH
+        paddw mm4,mm4 ; mm4=2*CbL
+        movq mm1,mm7 ; mm1=CrH
+        movq mm3,mm0 ; mm3=CrL
+        paddw mm7,mm7 ; mm7=2*CrH
+        paddw mm0,mm0 ; mm0=2*CrL
+
+        pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
+        pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
+        pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
+        pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
+
+        paddw mm6,[GOTOFF(eax,PW_ONE)]
+        paddw mm4,[GOTOFF(eax,PW_ONE)]
+        psraw mm6,1 ; mm6=(CbH * -FIX(0.22800))
+        psraw mm4,1 ; mm4=(CbL * -FIX(0.22800))
+        paddw mm7,[GOTOFF(eax,PW_ONE)]
+        paddw mm0,[GOTOFF(eax,PW_ONE)]
+        psraw mm7,1 ; mm7=(CrH * FIX(0.40200))
+        psraw mm0,1 ; mm0=(CrL * FIX(0.40200))
+
+        paddw mm6,mm5
+        paddw mm4,mm2
+        paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+        paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+        paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+        paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+        movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
+        movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
+
+        movq mm6,mm5
+        movq mm7,mm2
+        punpcklwd mm5,mm1
+        punpckhwd mm6,mm1
+        pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+        punpcklwd mm2,mm3
+        punpckhwd mm7,mm3
+        pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+        paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
+        paddd mm6,[GOTOFF(eax,PD_ONEHALF)]
+        psrad mm5,SCALEBITS
+        psrad mm6,SCALEBITS
+        paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
+        paddd mm7,[GOTOFF(eax,PD_ONEHALF)]
+        psrad mm2,SCALEBITS
+        psrad mm7,SCALEBITS
+
+        packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+        packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+        psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+        psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+        movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
+
+        mov al,2 ; Yctr
+        jmp short .Yloop_1st ; first pass uses the L halves still held in mm0/mm2/mm4
+        alignx 16,7
+
+.Yloop_2nd:
+        movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
+        movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
+        movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
+        alignx 16,7
+
+.Yloop_1st:
+        movq mm7, MMWORD [esi] ; mm7=Y(01234567)
+
+        pcmpeqw mm6,mm6
+        psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+        pand mm6,mm7 ; mm6=Y(0246)=YE
+        psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO
+
+        movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H)
+        movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H)
+        movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H)
+
+        paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+        paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+        packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+        packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+        paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+        paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+        packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+        packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+        paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+        paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+        packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+        packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+        punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+        punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
+        punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+        movq mmG,mmA
+        movq mmH,mmA
+        punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
+        punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+        psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+        psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+        movq mmC,mmD
+        movq mmB,mmD
+        punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
+        punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+        psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+        movq mmF,mmE
+        punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
+        punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+        punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
+        punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
+        punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+        cmp ecx, byte SIZEOF_MMWORD
+        jb short .column_st16 ; fewer columns left than one full group: take the partial-store tail
+
+        movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+        movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+        sub ecx, byte SIZEOF_MMWORD
+        jz near .endcolumn
+
+        add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+        add esi, byte SIZEOF_MMWORD ; inptr0
+        dec al ; Yctr
+        jnz near .Yloop_2nd
+
+        add ebx, byte SIZEOF_MMWORD ; inptr1
+        add edx, byte SIZEOF_MMWORD ; inptr2
+        jmp near .columnloop
+        alignx 16,7
+
+.column_st16:
+        lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+        cmp ecx, byte 2*SIZEOF_MMWORD
+        jb short .column_st8
+        movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+        movq mmA,mmC ; remaining bytes now come from the third mmword
+        sub ecx, byte 2*SIZEOF_MMWORD
+        add edi, byte 2*SIZEOF_MMWORD
+        jmp short .column_st4
+.column_st8:
+        cmp ecx, byte SIZEOF_MMWORD
+        jb short .column_st4
+        movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq mmA,mmE
+        sub ecx, byte SIZEOF_MMWORD
+        add edi, byte SIZEOF_MMWORD
+.column_st4:
+        movd eax,mmA
+        cmp ecx, byte SIZEOF_DWORD
+        jb short .column_st2
+        mov DWORD [edi+0*SIZEOF_DWORD], eax
+        psrlq mmA,DWORD_BIT
+        movd eax,mmA
+        sub ecx, byte SIZEOF_DWORD
+        add edi, byte SIZEOF_DWORD
+.column_st2:
+        cmp ecx, byte SIZEOF_WORD
+        jb short .column_st1
+        mov WORD [edi+0*SIZEOF_WORD], ax
+        shr eax,WORD_BIT
+        sub ecx, byte SIZEOF_WORD
+        add edi, byte SIZEOF_WORD
+.column_st1:
+        cmp ecx, byte SIZEOF_BYTE
+        jb short .endcolumn
+        mov BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+        pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+        pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+        pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+        punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+        punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
+        punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
+        punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+        movq mmC,mmA
+        punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
+        punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
+        movq mmG,mmB
+        punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
+        punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+        movq mmD,mmA
+        punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
+        punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
+        movq mmH,mmC
+        punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
+        punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+        cmp ecx, byte SIZEOF_MMWORD
+        jb short .column_st16 ; fewer columns left than one full group: take the partial-store tail
+
+        movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+        movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+        movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+        sub ecx, byte SIZEOF_MMWORD
+        jz short .endcolumn
+
+        add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+        add esi, byte SIZEOF_MMWORD ; inptr0
+        dec al ; Yctr
+        jnz near .Yloop_2nd
+
+        add ebx, byte SIZEOF_MMWORD ; inptr1
+        add edx, byte SIZEOF_MMWORD ; inptr2
+        jmp near .columnloop
+        alignx 16,7
+
+.column_st16:
+        cmp ecx, byte SIZEOF_MMWORD/2
+        jb short .column_st8
+        movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+        movq mmA,mmC
+        movq mmD,mmH
+        sub ecx, byte SIZEOF_MMWORD/2
+        add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+        cmp ecx, byte SIZEOF_MMWORD/4
+        jb short .column_st4
+        movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq mmA,mmD
+        sub ecx, byte SIZEOF_MMWORD/4
+        add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+        cmp ecx, byte SIZEOF_MMWORD/8
+        jb short .endcolumn
+        movd DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+        emms ; empty MMX state
+
+.return:
+        pop edi
+        pop esi
+;       pop edx ; need not be preserved
+;       pop ecx ; need not be preserved
+        pop ebx
+        mov esp,ebp ; esp <- aligned ebp
+        pop esp ; esp <- original ebp
+        pop ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+
+        align 16
+        global EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+        push ebp
+        mov ebp,esp
+        push ebx
+;       push ecx ; need not be preserved
+;       push edx ; need not be preserved
+        push esi
+        push edi
+
+        mov eax, JDIMENSION [output_width(ebp)]
+
+        mov edi, JSAMPIMAGE [input_buf(ebp)]
+        mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+        mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+        mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+        mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+        mov edi, JSAMPARRAY [output_buf(ebp)]
+        lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; pre-offset by ctr rows; the callee indexes by ctr again, reaching row 2*ctr
+
+        push edx ; inptr2
+        push ebx ; inptr1
+        push esi ; inptr00
+        mov ebx,esp ; ebx -> the three pointers just pushed, used as a temporary input_buf
+
+        push edi ; output_buf (outptr0)
+        push ecx ; in_row_group_ctr
+        push ebx ; input_buf
+        push eax ; output_width
+
+        call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+        add esi, byte SIZEOF_JSAMPROW ; inptr01
+        add edi, byte SIZEOF_JSAMPROW ; outptr1
+        mov POINTER [ebx+0*SIZEOF_POINTER], esi ; patch entry 0 of the temporary input_buf
+        mov POINTER [ebx-1*SIZEOF_POINTER], edi ; patch the pushed output_buf argument (first slot below the array)
+
+        call near EXTN(jsimd_h2v1_merged_upsample_mmx) ; same stacked arguments are reused for the second row
+
+        add esp, byte 7*SIZEOF_DWORD ; drop 4 arguments + 3 temporary input_buf entries
+
+        pop edi
+        pop esi
+;       pop edx ; need not be preserved
+;       pop ecx ; need not be preserved
+        pop ebx
+        pop ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align 16
diff --git a/Builder/jni-1.11/simd/jdmrgext-sse2-64.asm b/Builder/jni-1.11/simd/jdmrgext-sse2-64.asm
new file mode 100644
index 000000000..ad74c5ff4
--- /dev/null
+++ b/Builder/jni-1.11/simd/jdmrgext-sse2-64.asm
@@ -0,0 +1,537 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2012, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+
+        align 16
+        global EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+        push rbp ; save caller's frame pointer
+        mov rax,rsp ; rax = original rbp
+        sub rsp, byte 4
+        and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+        mov [rsp],rax ; stash the unaligned stack pointer; .return reloads it with "pop rsp"
+        mov rbp,rsp ; rbp = aligned rbp
+        lea rsp, [wk(0)] ; reserve the WK_NUM-entry wk[] scratch area below rbp
+        collect_args
+        push rbx ; rbx is used as inptr1 below and restored at .return
+
+        mov ecx, r10d ; col
+        test rcx,rcx
+        jz near .return ; output_width == 0: nothing to convert
+
+        push rcx ; rcx is reused for in_row_group_ctr below; col is popped back afterwards
+
+        mov rdi, r11
+        mov ecx, r12d
+        mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+        mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+        mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+        mov rdi, r13
+        mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+        mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+        mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+        mov rdi, JSAMPROW [rdi] ; outptr
+
+        pop rcx ; col
+
+.columnloop:
+
+        movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
+        movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
+
+        pxor xmm1,xmm1 ; xmm1=(all 0's)
+        pcmpeqw xmm3,xmm3
+        psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+        movdqa xmm4,xmm6
+        punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+        punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
+        movdqa xmm0,xmm7
+        punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+        punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
+
+        paddw xmm6,xmm3 ; adding 0xFF80 to each word subtracts 128 from the chroma sample
+        paddw xmm4,xmm3
+        paddw xmm7,xmm3
+        paddw xmm0,xmm3
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movdqa xmm5,xmm6 ; xmm5=CbH
+        movdqa xmm2,xmm4 ; xmm2=CbL
+        paddw xmm6,xmm6 ; xmm6=2*CbH
+        paddw xmm4,xmm4 ; xmm4=2*CbL
+        movdqa xmm1,xmm7 ; xmm1=CrH
+        movdqa xmm3,xmm0 ; xmm3=CrL
+        paddw xmm7,xmm7 ; xmm7=2*CrH
+        paddw xmm0,xmm0 ; xmm0=2*CrL
+
+        pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
+        pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
+        pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
+        pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
+
+        paddw xmm6,[rel PW_ONE]
+        paddw xmm4,[rel PW_ONE]
+        psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
+        psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
+        paddw xmm7,[rel PW_ONE]
+        paddw xmm0,[rel PW_ONE]
+        psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
+        psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
+
+        paddw xmm6,xmm5
+        paddw xmm4,xmm2
+        paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+        paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+        paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+        paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+        movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+        movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+        movdqa xmm6,xmm5
+        movdqa xmm7,xmm2
+        punpcklwd xmm5,xmm1
+        punpckhwd xmm6,xmm1
+        pmaddwd xmm5,[rel PW_MF0344_F0285]
+        pmaddwd xmm6,[rel PW_MF0344_F0285]
+        punpcklwd xmm2,xmm3
+        punpckhwd xmm7,xmm3
+        pmaddwd xmm2,[rel PW_MF0344_F0285]
+        pmaddwd xmm7,[rel PW_MF0344_F0285]
+
+        paddd xmm5,[rel PD_ONEHALF]
+        paddd xmm6,[rel PD_ONEHALF]
+        psrad xmm5,SCALEBITS
+        psrad xmm6,SCALEBITS
+        paddd xmm2,[rel PD_ONEHALF]
+        paddd xmm7,[rel PD_ONEHALF]
+        psrad xmm2,SCALEBITS
+        psrad xmm7,SCALEBITS
+
+        packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+        packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+        psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+        psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+        movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+        mov al,2 ; Yctr
+        jmp short .Yloop_1st ; first pass uses the L halves still held in xmm0/xmm2/xmm4
+
+.Yloop_2nd:
+        movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+        movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+        movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+
+.Yloop_1st:
+        movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
+
+        pcmpeqw xmm6,xmm6
+        psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+        pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
+        psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+        movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+        movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+        movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+        paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+        paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+        packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
+        packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
+
+        paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+        paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+        packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
+        packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
+
+        paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+        paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+        packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
+        packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+        punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+        punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+        movdqa xmmG,xmmA
+        movdqa xmmH,xmmA
+        punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+        punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+        psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+        psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+        movdqa xmmC,xmmD
+        movdqa xmmB,xmmD
+        punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+        punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+        psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+        movdqa xmmF,xmmE
+        punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+        punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+        pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+        movdqa xmmB,xmmE
+        punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+        punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+        punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+        pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+        movdqa xmmB,xmmF
+        punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+        punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+        punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+        punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        cmp rcx, byte SIZEOF_XMMWORD
+        jb short .column_st32 ; fewer columns left than one full group: take the partial-store tail
+
+        test rdi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+        jnz short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+        jmp short .out0
+.out1: ; --(unaligned)-----------------
+        movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+        add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+        sub rcx, byte SIZEOF_XMMWORD
+        jz near .endcolumn
+
+        add rsi, byte SIZEOF_XMMWORD ; inptr0
+        dec al ; Yctr
+        jnz near .Yloop_2nd
+
+        add rbx, byte SIZEOF_XMMWORD ; inptr1
+        add rdx, byte SIZEOF_XMMWORD ; inptr2
+        jmp near .columnloop
+
+.column_st32:
+        lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+        cmp rcx, byte 2*SIZEOF_XMMWORD
+        jb short .column_st16
+        movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+        movdqa xmmA,xmmF
+        sub rcx, byte 2*SIZEOF_XMMWORD
+        jmp short .column_st15
+.column_st16:
+        cmp rcx, byte SIZEOF_XMMWORD
+        jb short .column_st15
+        movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        add rdi, byte SIZEOF_XMMWORD ; outptr
+        movdqa xmmA,xmmD
+        sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+        ; Store the lower 8 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp rcx, byte SIZEOF_MMWORD
+        jb short .column_st7
+        movq XMM_MMWORD [rdi], xmmA
+        add rdi, byte SIZEOF_MMWORD
+        sub rcx, byte SIZEOF_MMWORD
+        psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+        ; Store the lower 4 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp rcx, byte SIZEOF_DWORD
+        jb short .column_st3
+        movd XMM_DWORD [rdi], xmmA
+        add rdi, byte SIZEOF_DWORD
+        sub rcx, byte SIZEOF_DWORD
+        psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+        ; Store the lower 2 bytes of rax to the output when it has enough
+        ; space.
+        movd eax, xmmA
+        cmp rcx, byte SIZEOF_WORD
+        jb short .column_st1
+        mov WORD [rdi], ax
+        add rdi, byte SIZEOF_WORD
+        sub rcx, byte SIZEOF_WORD
+        shr rax, 16
+.column_st1:
+        ; Store the lower 1 byte of rax to the output when it has enough
+        ; space.
+        test rcx, rcx
+        jz short .endcolumn
+        mov BYTE [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+        pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+        pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+        pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+        punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+        punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+        punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+        movdqa xmmC,xmmA
+        punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+        punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+        movdqa xmmG,xmmB
+        punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+        punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+        movdqa xmmD,xmmA
+        punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        movdqa xmmH,xmmC
+        punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        cmp rcx, byte SIZEOF_XMMWORD
+        jb short .column_st32 ; fewer columns left than one full group: take the partial-store tail
+
+        test rdi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+        jnz short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+        jmp short .out0
+.out1: ; --(unaligned)-----------------
+        movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+        movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+        add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+        sub rcx, byte SIZEOF_XMMWORD
+        jz near .endcolumn
+
+        add rsi, byte SIZEOF_XMMWORD ; inptr0
+        dec al ; Yctr
+        jnz near .Yloop_2nd
+
+        add rbx, byte SIZEOF_XMMWORD ; inptr1
+        add rdx, byte SIZEOF_XMMWORD ; inptr2
+        jmp near .columnloop
+
+.column_st32:
+        cmp rcx, byte SIZEOF_XMMWORD/2
+        jb short .column_st16
+        movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+        movdqa xmmA,xmmC
+        movdqa xmmD,xmmH
+        sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+        cmp rcx, byte SIZEOF_XMMWORD/4
+        jb short .column_st15
+        movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        add rdi, byte SIZEOF_XMMWORD ; outptr
+        movdqa xmmA,xmmD
+        sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+        ; space.
+        cmp rcx, byte SIZEOF_XMMWORD/8
+        jb short .column_st7
+        movq XMM_MMWORD [rdi], xmmA
+        add rdi, byte SIZEOF_XMMWORD/8*4
+        sub rcx, byte SIZEOF_XMMWORD/8
+        psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+        ; space.
+        test rcx, rcx
+        jz short .endcolumn
+        movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+        sfence ; flush the write buffer
+
+.return:
+        pop rbx
+        uncollect_args
+        mov rsp,rbp ; rsp <- aligned rbp
+        pop rsp ; rsp <- original rbp
+        pop rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+        align 16
+        global EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+        push rbp
+        mov rax,rsp ; NOTE(review): presumably consumed by collect_args (defined elsewhere) - confirm
+        mov rbp,rsp
+        collect_args
+        push rbx ; rbx is used as scratch below; restored before uncollect_args
+
+        mov eax, r10d ; eax = output_width
+
+        mov rdi, r11
+        mov ecx, r12d ; ecx = in_row_group_ctr
+        mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+        mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+        mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+        mov rdi, r13 ; rdi = output_buf
+        lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; pre-offset by ctr rows; the callee indexes by ctr again, reaching row 2*ctr
+
+        push rdx ; inptr2
+        push rbx ; inptr1
+        push rsi ; inptr00
+        mov rbx,rsp ; rbx -> the three pointers just pushed, used as a temporary input_buf
+
+        push rdi ; save output_buf, in_row_group_ctr and output_width across the call
+        push rcx
+        push rax
+
+        %ifdef WIN64
+        mov r8, rcx ; in_row_group_ctr (read before rcx is overwritten below)
+        mov r9, rdi ; output_buf
+        mov rcx, rax ; output_width
+        mov rdx, rbx ; input_buf (temporary array on the stack)
+        %else
+        mov rdx, rcx ; in_row_group_ctr (read before rcx is overwritten below)
+        mov rcx, rdi ; output_buf (read before rdi is overwritten below)
+        mov rdi, rax ; output_width
+        mov rsi, rbx ; input_buf (temporary array on the stack)
+        %endif
+
+        call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+        pop rax ; restore output_width, in_row_group_ctr, output_buf
+        pop rcx
+        pop rdi
+        pop rsi ; unwind the temporary input_buf: inptr00, inptr1, inptr2
+        pop rbx
+        pop rdx
+
+        add rdi, byte SIZEOF_JSAMPROW ; outptr1
+        add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+        push rdx ; inptr2
+        push rbx ; inptr1
+        push rsi ; inptr00
+        mov rbx,rsp ; rbx -> rebuilt temporary input_buf for the second row
+
+        push rdi
+        push rcx
+        push rax
+
+        %ifdef WIN64
+        mov r8, rcx
+        mov r9, rdi
+        mov rcx, rax
+        mov rdx, rbx
+        %else
+        mov rdx, rcx
+        mov rcx, rdi
+        mov rdi, rax
+        mov rsi, rbx
+        %endif
+
+        call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+        pop rax ; discard the six temporaries pushed for the second call
+        pop rcx
+        pop rdi
+        pop rsi
+        pop rbx
+        pop rdx
+
+        pop rbx ; matches the "push rbx" after collect_args
+        uncollect_args
+        pop rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16 diff --git a/Builder/jni-1.11/simd/jdmrgext-sse2.asm b/Builder/jni-1.11/simd/jdmrgext-sse2.asm new file mode 100644 index 000000000..b50f698b4 --- /dev/null +++ b/Builder/jni-1.11/simd/jdmrgext-sse2.asm @@ -0,0 +1,518 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2012, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
+; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16,7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1,xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3,xmm3 + psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa 
xmm4,xmm6 + punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0,xmm7 + punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6,xmm3 + paddw xmm4,xmm3 + paddw xmm7,xmm3 + paddw xmm0,xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5,xmm6 ; xmm5=CbH + movdqa xmm2,xmm4 ; xmm2=CbL + paddw xmm6,xmm6 ; xmm6=2*CbH + paddw xmm4,xmm4 ; xmm4=2*CbL + movdqa xmm1,xmm7 ; xmm1=CrH + movdqa xmm3,xmm0 ; xmm3=CrL + paddw xmm7,xmm7 ; xmm7=2*CrH + paddw xmm0,xmm0 ; xmm0=2*CrL + + pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6,[GOTOFF(eax,PW_ONE)] + paddw xmm4,[GOTOFF(eax,PW_ONE)] + psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7,[GOTOFF(eax,PW_ONE)] + paddw xmm0,[GOTOFF(eax,PW_ONE)] + psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6,xmm5 + paddw xmm4,xmm2 + paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6,xmm5 + movdqa xmm7,xmm2 + punpcklwd xmm5,xmm1 + punpckhwd xmm6,xmm1 + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd 
xmm7,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm5,SCALEBITS + psrad xmm6,SCALEBITS + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm7,SCALEBITS + + packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st + alignx 16,7 + +.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + alignx 16,7 + +.Yloop_1st: + movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6,xmm6 + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 
25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; 
xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. 
+ cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + mov BYTE [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + 
movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .endcolumn + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + + align 16 + global EXTN(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx,esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esp, byte 7*SIZEOF_DWORD + + 
pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jdsample-altivec.c b/Builder/jni-1.11/simd/jdsample-altivec.c new file mode 100644 index 000000000..b40ce55c8 --- /dev/null +++ b/Builder/jni-1.11/simd/jdsample-altivec.c @@ -0,0 +1,392 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* CHROMA UPSAMPLING */ + +#include "jsimd_altivec.h" + + +void +jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0, + out; + __vector short this0e, this0o, this0l, this0h, last0l, last0h, + next0l, next0h, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, + last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}, + last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30}, + next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, + next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15}, +#if __BIG_ENDIAN__ + merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; +#else + merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; +#endif + __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + if (downsampled_width & 15) + inptr[downsampled_width] = inptr[downsampled_width - 1]; + + this0 = vec_ld(0, inptr); + p_last0 = vec_perm(this0, this0, last_index_col0); + last0 = this0; + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr += 16, outptr += 32) { + + if (downsampled_width - incol > 0) { + p_last0 = vec_perm(last0, this0, last_index); + last0 = this0; + } + + if (incol <= 16) + p_next0 = vec_perm(this0, this0, next_index_lastcol); + else { + next0 = vec_ld(16, inptr); + p_next0 = vec_perm(this0, next0, next_index); + } + + this0e = (__vector short)vec_mule(this0, pb_three); + this0o = (__vector short)vec_mulo(this0, pb_three); + this0l = vec_mergeh(this0e, this0o); + this0h = vec_mergel(this0e, this0o); + + last0l = (__vector short)VEC_UNPACKHU(p_last0); + 
last0h = (__vector short)VEC_UNPACKLU(p_last0); + last0l = vec_add(last0l, pw_one); + + next0l = (__vector short)VEC_UNPACKHU(p_next0); + next0h = (__vector short)VEC_UNPACKLU(p_next0); + next0l = vec_add(next0l, pw_two); + + outle = vec_add(this0l, last0l); + outlo = vec_add(this0l, next0l); + outle = vec_sr(outle, (__vector unsigned short)pw_two); + outlo = vec_sr(outlo, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr); + + if (incol > 8) { + last0h = vec_add(last0h, pw_one); + next0h = vec_add(next0h, pw_two); + + outhe = vec_add(this0h, last0h); + outho = vec_add(this0h, next0h); + outhe = vec_sr(outhe, (__vector unsigned short)pw_two); + outho = vec_sr(outho, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr); + } + + this0 = next0; + } + } +} + + +void +jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char this_1, this0, this1, out; + __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, + lastcolsum_1h, lastcolsum1h, + p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, + thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, + nextcolsum_1l = {0}, nextcolsum_1h = {0}, + nextcolsum1l = {0}, nextcolsum1h = {0}, + p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, + tmpl, tmph, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, + last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, + last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29}, + next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17}, + 
next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15}, +#if __BIG_ENDIAN__ + merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; +#else + merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; +#endif + __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, + pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; + __vector unsigned short pw_four = { __8X(4) }; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr_1 = input_data[inrow - 1]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + if (downsampled_width & 15) { + inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; + inptr0[downsampled_width] = inptr0[downsampled_width - 1]; + inptr1[downsampled_width] = inptr1[downsampled_width - 1]; + } + + this0 = vec_ld(0, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(0, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + thiscolsum_1l = vec_add(this0l, this_1l); + thiscolsum_1h = vec_add(this0h, this_1h); + lastcolsum_1h = thiscolsum_1h; + p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + + this1 = vec_ld(0, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + thiscolsum1l = vec_add(this0l, this1l); + thiscolsum1h = vec_add(this0h, this1h); + lastcolsum1h = thiscolsum1h; + p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, + outptr0 += 32, 
outptr1 += 32) { + + if (downsampled_width - incol > 0) { + p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; + } + + if (incol <= 16) { + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, + next_index_lastcol); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, + next_index_lastcol); + } else { + this0 = vec_ld(16, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(16, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + nextcolsum_1l = vec_add(this0l, this_1l); + nextcolsum_1h = vec_add(this0h, this_1h); + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); + + this1 = vec_ld(16, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + nextcolsum1l = vec_add(this0l, this1l); + nextcolsum1h = vec_add(this0h, this1h); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); + } + + /* Process the upper row */ + + tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum_1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum_1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); 
+ + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr0); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum_1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum_1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr0); + } + + /* Process the lower row */ + + tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr1); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr1); + } + + thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; + thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; + } + } +} + + +/* These are rarely used (mainly just for decompressing YCCK images) */ + +void +jsimd_h2v1_upsample_altivec (int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = 
input_data[inrow]; + outptr = output_data[inrow]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr); + vec_st(inh, 16, outptr); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr); + vec_st(inh, 48, outptr); + } + } + } +} + + +void +jsimd_h2v2_upsample_altivec (int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr0); + vec_st(inl, 0, outptr1); + + vec_st(inh, 16, outptr0); + vec_st(inh, 16, outptr1); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr0); + vec_st(inl, 32, outptr1); + + vec_st(inh, 48, outptr0); + vec_st(inh, 48, outptr1); + } + } + } +} diff --git a/Builder/jni-1.11/simd/jdsample-mmx.asm b/Builder/jni-1.11/simd/jdsample-mmx.asm new file mode 100644 index 000000000..5e4fa7ae2 --- /dev/null +++ b/Builder/jni-1.11/simd/jdsample-mmx.asm @@ -0,0 +1,736 @@ +; +; jdsample.asm - upsampling (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_mmx) + +EXTN(jconst_fancy_upsample_mmx): + +PW_ONE times 4 dw 1 +PW_TWO times 4 dw 2 +PW_THREE times 4 dw 3 +PW_SEVEN times 4 dw 7 +PW_EIGHT times 4 dw 8 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. 
+; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_mmx) + +EXTN(jsimd_h2v1_fancy_upsample_mmx): + push ebp + mov ebp,esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_MMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor mm0,mm0 ; mm0=(all 0's) + pcmpeqb mm7,mm7 + psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT + pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + pcmpeqb mm6,mm6 + psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT + pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] + jmp short .upsample + alignx 16,7 + +.columnloop: + movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] + psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT + +.upsample: + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2,mm1 + movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7) + psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) + psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) 
+ + por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6) + por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8) + + movq mm7,mm1 + psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) + + movq mm4,mm1 + punpcklbw mm1,mm0 ; mm1=( 0 1 2 3) + punpckhbw mm4,mm0 ; mm4=( 4 5 6 7) + movq mm5,mm2 + punpcklbw mm2,mm0 ; mm2=(-1 0 1 2) + punpckhbw mm5,mm0 ; mm5=( 3 4 5 6) + movq mm6,mm3 + punpcklbw mm3,mm0 ; mm3=( 1 2 3 4) + punpckhbw mm6,mm0 ; mm6=( 5 6 7 8) + + pmullw mm1,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + paddw mm2,[GOTOFF(ebx,PW_ONE)] + paddw mm5,[GOTOFF(ebx,PW_ONE)] + paddw mm3,[GOTOFF(ebx,PW_TWO)] + paddw mm6,[GOTOFF(ebx,PW_TWO)] + + paddw mm2,mm1 + paddw mm5,mm4 + psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6) + psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14) + paddw mm3,mm1 + paddw mm6,mm4 + psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7) + psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15) + + psllw mm3,BYTE_BIT + psllw mm6,BYTE_BIT + por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) + por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 + + sub eax, byte SIZEOF_MMWORD + add esi, byte 1*SIZEOF_MMWORD ; inptr + add edi, byte 2*SIZEOF_MMWORD ; outptr + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. 
+; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr + + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_mmx) + +EXTN(jsimd_h2v2_fancy_upsample_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_MMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov 
JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] + movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] + movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3,mm3 ; mm3=(all 0's) + movq mm4,mm0 + punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3) + punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7) + movq mm5,mm1 + punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3) + punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7) + movq mm6,mm2 + punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3) + punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7) + + pmullw mm0,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + + pcmpeqb mm7,mm7 + psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT + + paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 + + pand mm1,mm7 ; mm1=( 0 - - -) + pand mm2,mm7 ; mm2=( 0 - - -) + + movq MMWORD [wk(0)], mm1 + movq MMWORD [wk(1)], mm2 + + poppic ebx + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb mm1,mm1 + psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT + movq mm2,mm1 + + pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) + pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) + + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 + + jmp short .upsample + alignx 16,7 + 
+.columnloop: + ; -- process the next column block + + movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] + movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3,mm3 ; mm3=(all 0's) + movq mm4,mm0 + punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3) + punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7) + movq mm5,mm1 + punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3) + punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7) + movq mm6,mm2 + punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3) + punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7) + + pmullw mm0,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + + paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 + + psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) + psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) + + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 + +.upsample: + ; -- process the upper row + + movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) + movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) + + movq mm0,mm7 + movq mm4,mm3 + psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -) + psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) + movq mm5,mm7 + movq mm6,mm3 + psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) + psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6) + + por mm0,mm4 ; mm0=( 1 2 3 4) + por mm5,mm6 ; mm5=( 3 4 5 6) + + movq mm1,mm7 + movq mm2,mm3 + psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -) + movq mm4,mm3 + psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) + + por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) + por mm2, 
MMWORD [wk(2)] ; mm2=( 5 6 7 8) + + movq MMWORD [wk(0)], mm4 + + pmullw mm7,[GOTOFF(ebx,PW_THREE)] + pmullw mm3,[GOTOFF(ebx,PW_THREE)] + paddw mm1,[GOTOFF(ebx,PW_EIGHT)] + paddw mm5,[GOTOFF(ebx,PW_EIGHT)] + paddw mm0,[GOTOFF(ebx,PW_SEVEN)] + paddw mm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw mm1,mm7 + paddw mm5,mm3 + psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6) + psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14) + paddw mm0,mm7 + paddw mm2,mm3 + psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7) + psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15) + + psllw mm0,BYTE_BIT + psllw mm2,BYTE_BIT + por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) + por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 + + ; -- process the lower row + + movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) + movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) + + movq mm7,mm6 + movq mm3,mm4 + psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -) + psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) + movq mm0,mm6 + movq mm2,mm4 + psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) + psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6) + + por mm7,mm3 ; mm7=( 1 2 3 4) + por mm0,mm2 ; mm0=( 3 4 5 6) + + movq mm1,mm6 + movq mm5,mm4 + psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -) + movq mm3,mm4 + psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) + + por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) + por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) + + movq MMWORD [wk(1)], mm3 + + pmullw mm6,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + paddw mm1,[GOTOFF(ebx,PW_EIGHT)] + paddw mm0,[GOTOFF(ebx,PW_EIGHT)] + paddw mm7,[GOTOFF(ebx,PW_SEVEN)] + paddw mm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw mm1,mm6 + paddw mm0,mm4 + psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6) + psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14) + paddw mm7,mm6 + paddw mm5,mm4 + psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7) + psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15) + + psllw mm7,BYTE_BIT + psllw mm5,BYTE_BIT + por mm1,mm7 ; 
mm1=Out1L=( 0 1 2 3 4 5 6 7) + por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 + + poppic ebx + + sub eax, byte SIZEOF_MMWORD + add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_MMWORD ; inptr0 + add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_MMWORD ; outptr0 + add edi, byte 2*SIZEOF_MMWORD ; outptr1 + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v1_upsample_mmx (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_upsample_mmx) + +EXTN(jsimd_h2v1_upsample_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] ; edx = output_width + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) ; round up to a multiple of 2*SIZEOF_MMWORD + jz short .return ; nothing to do when the rounded width is 0 + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; load one MMWORD of input samples + + movq mm1,mm0 + punpcklbw mm0,mm0 ; low half, every byte doubled + punpckhbw mm1,mm1 ; high half, every byte doubled + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD ; two output MMWORDs written + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; same again for the next input MMWORD + + movq mm3,mm2 + punpcklbw mm2,mm2 + punpckhbw mm3,mm3 + + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 4*SIZEOF_MMWORD ; outptr + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +;
pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_mmx (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v2_upsample_mmx) + +EXTN(jsimd_h2v2_upsample_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1,mm0 + punpcklbw mm0,mm0 + punpckhbw mm1,mm1 + + movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 + movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm3,mm2 + punpcklbw mm2,mm2 + punpckhbw mm3,mm3 + + movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 + movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add ebx, byte 4*SIZEOF_MMWORD ; outptr0 + add edi, byte 4*SIZEOF_MMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jdsample-sse2-64.asm b/Builder/jni-1.11/simd/jdsample-sse2-64.asm new file mode 100644 index 000000000..1faaed648 --- /dev/null +++ b/Builder/jni-1.11/simd/jdsample-sse2-64.asm @@ -0,0 +1,670 @@ +; +; jdsample.asm - upsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov eax, r11d ; colctr + test rax,rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand 
xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + pcmpeqb xmm6,xmm6 + pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] + jmp short .upsample + +.columnloop: + movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + paddw xmm2,[rel PW_ONE] + paddw xmm5,[rel PW_ONE] + paddw xmm3,[rel PW_TWO] + paddw xmm6,[rel PW_TWO] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 + + sub rax, byte SIZEOF_XMMWORD + add rsi, byte 1*SIZEOF_XMMWORD ; inptr + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop + +.return: + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 + + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov eax, r11d ; colctr + test rax,rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW 
[rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + ; -- process the last column block + + pcmpeqb 
xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 + + pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq 
xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[rel PW_THREE] + pmullw xmm3,[rel PW_THREE] + paddw xmm1,[rel PW_EIGHT] + paddw xmm5,[rel PW_EIGHT] + paddw xmm0,[rel PW_SEVEN] + paddw xmm2,[rel PW_SEVEN] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + paddw xmm1,[rel PW_EIGHT] + paddw xmm0,[rel PW_EIGHT] + paddw xmm7,[rel PW_SEVEN] + paddw xmm5,[rel PW_SEVEN] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 + + sub rax, byte SIZEOF_XMMWORD + add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test rax,rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + mov rax,rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + 
sub rax, byte 2*SIZEOF_XMMWORD ; two output XMMWORDs written + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; same again for the next input XMMWORD + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 ; low half, every byte doubled + punpckhbw xmm3,xmm3 ; high half, every byte doubled + + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop + +.return: + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx ; rbx holds outptr0 in the row loop + + mov edx, r11d ; edx = output_width + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) ; round up to a multiple of 2*SIZEOF_XMMWORD + jz near .return ; nothing to do when the rounded width is 0 + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax,rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; load one XMMWORD of input samples + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 ; low half, every byte doubled + punpckhbw xmm1,xmm1 ; high half, every byte doubled + + movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 ; the same data is stored to both output rows + movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa
XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jdsample-sse2.asm b/Builder/jni-1.11/simd/jdsample-sse2.asm new file mode 100644 index 000000000..1d0059e80 --- /dev/null +++ b/Builder/jni-1.11/simd/jdsample-sse2.asm @@ -0,0 +1,728 @@ +; +; jdsample.asm - upsampling (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push ebp + mov ebp,esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov 
esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + pcmpeqb xmm6,xmm6 + pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] + jmp short .upsample + alignx 16,7 + +.columnloop: + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm2,[GOTOFF(ebx,PW_ONE)] + paddw xmm5,[GOTOFF(ebx,PW_ONE)] + paddw xmm3,[GOTOFF(ebx,PW_TWO)] + paddw xmm6,[GOTOFF(ebx,PW_TWO)] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. 
+; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr + + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + 
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + poppic ebx + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 
+ + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + alignx 16,7 + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 
2 3 4 5 6 7 --) + pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[GOTOFF(ebx,PW_THREE)] + pmullw xmm3,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; 
need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 +
movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jfdctflt-3dn.asm b/Builder/jni-1.11/simd/jfdctflt-3dn.asm new file mode 100644 index 000000000..219161819 --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctflt-3dn.asm @@ -0,0 +1,319 @@ +; +; jfdctflt.asm - floating-point FDCT (3DNow!) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_float_3dnow) + +EXTN(jconst_fdct_float_3dnow): + +PD_0_382 times 2 dd 0.382683432365089771728460 +PD_0_707 times 2 dd 0.707106781186547524400844 +PD_0_541 times 2 dd 0.541196100146196984399723 +PD_1_306 times 2 dd 1.306562964876376527856643 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_3dnow (FAST_FLOAT *data) +; + +%define data(b) (b)+8 ; FAST_FLOAT *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_float_3dnow) + +EXTN(jsimd_fdct_float_3dnow): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) + + movq mm4,mm0 ; transpose coefficients + punpckldq mm0,mm1 ; mm0=(00 10)=data0 + punpckhdq mm4,mm1 ; mm4=(01 11)=data1 + movq mm5,mm2 ; transpose coefficients + punpckldq mm2,mm3 ; mm2=(06 16)=data6 + punpckhdq mm5,mm3 ; mm5=(07 17)=data7 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm2 ; mm4=data1-data6=tmp6 + pfsub mm0,mm5 ; mm0=data0-data7=tmp7 + pfadd mm6,mm2 ; mm6=data1+data6=tmp1 + pfadd mm7,mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4,mm1 ; transpose coefficients + punpckldq mm1,mm3 ; mm1=(02 12)=data2 + punpckhdq mm4,mm3 ; mm4=(03 13)=data3 + movq mm0,mm2 ; transpose coefficients + punpckldq mm2,mm5 ; mm2=(04 14)=data4 + punpckhdq mm0,mm5 ; mm0=(05 15)=data5 + + movq mm3,mm4 + movq mm5,mm1 + pfadd mm4,mm2 ; mm4=data3+data4=tmp3 + pfadd mm1,mm0 ; mm1=data2+data5=tmp2 + pfsub mm3,mm2 ; mm3=data3-data4=tmp4 + pfsub mm5,mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2,mm7 + movq mm0,mm6 + pfsub mm7,mm4 ; mm7=tmp13 + pfsub mm6,mm1 ; mm6=tmp12 + pfadd mm2,mm4 ; mm2=tmp10 + pfadd mm0,mm1 ; mm0=tmp11 + + pfadd mm6,mm7 + pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4,mm2 + movq mm1,mm7 + pfsub mm2,mm0 ; mm2=data4 + pfsub mm7,mm6 ; mm7=data6 + pfadd mm4,mm0 ; mm4=data0 + pfadd mm1,mm6 ; mm1=data2 + + movq MMWORD 
[MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3,mm5 ; mm3=tmp10 + pfadd mm5,mm0 ; mm5=tmp11 + pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2,mm3 ; mm2=tmp10 + pfsub mm3,mm0 + pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2,mm3 ; mm2=z2 + pfadd mm0,mm3 ; mm0=z4 + + movq mm7,mm6 + pfsub mm6,mm5 ; mm6=z13 + pfadd mm7,mm5 ; mm7=z11 + + movq mm4,mm6 + movq mm1,mm7 + pfsub mm6,mm2 ; mm6=data3 + pfsub mm7,mm0 ; mm7=data7 + pfadd mm4,mm2 ; mm4=data5 + pfadd mm1,mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) + + movq mm4,mm0 ; transpose coefficients + punpckldq mm0,mm1 ; mm0=(00 01)=data0 + punpckhdq mm4,mm1 ; mm4=(10 11)=data1 + movq mm5,mm2 ; transpose coefficients + punpckldq mm2,mm3 ; mm2=(60 61)=data6 + punpckhdq mm5,mm3 ; mm5=(70 71)=data7 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm2 ; mm4=data1-data6=tmp6 + pfsub mm0,mm5 ; mm0=data0-data7=tmp7 + pfadd mm6,mm2 ; mm6=data1+data6=tmp1 + pfadd mm7,mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4,mm1 ; transpose coefficients + punpckldq mm1,mm3 ; mm1=(20 21)=data2 + punpckhdq mm4,mm3 ; mm4=(30 31)=data3 + movq mm0,mm2 ; transpose coefficients + punpckldq mm2,mm5 ; mm2=(40 41)=data4 + punpckhdq mm0,mm5 ; mm0=(50 51)=data5 + + movq mm3,mm4 + movq mm5,mm1 + pfadd mm4,mm2 ; mm4=data3+data4=tmp3 + pfadd mm1,mm0 ; mm1=data2+data5=tmp2 + pfsub mm3,mm2 ; mm3=data3-data4=tmp4 + pfsub mm5,mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2,mm7 + movq mm0,mm6 + pfsub mm7,mm4 ; mm7=tmp13 + pfsub mm6,mm1 ; mm6=tmp12 + pfadd mm2,mm4 ; mm2=tmp10 + pfadd mm0,mm1 ; mm0=tmp11 + + pfadd mm6,mm7 + pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4,mm2 + movq mm1,mm7 + pfsub mm2,mm0 ; mm2=data4 + pfsub mm7,mm6 ; mm7=data6 + pfadd mm4,mm0 ; mm4=data0 + pfadd mm1,mm6 ; mm1=data2 + + movq MMWORD 
[MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3,mm5 ; mm3=tmp10 + pfadd mm5,mm0 ; mm5=tmp11 + pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2,mm3 ; mm2=tmp10 + pfsub mm3,mm0 + pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2,mm3 ; mm2=z2 + pfadd mm0,mm3 ; mm0=z4 + + movq mm7,mm6 + pfsub mm6,mm5 ; mm6=z13 + pfadd mm7,mm5 ; mm7=z11 + + movq mm4,mm6 + movq mm1,mm7 + pfsub mm6,mm2 ; mm6=data3 + pfsub mm7,mm0 ; mm7=data7 + pfadd mm4,mm2 ; mm4=data5 + pfadd mm1,mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + + femms ; empty MMX/3DNow! state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jfdctflt-sse-64.asm b/Builder/jni-1.11/simd/jfdctflt-sse-64.asm new file mode 100644 index 000000000..4b64ea4bb --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctflt-sse-64.asm @@ -0,0 +1,357 @@ +; +; jfdctflt.asm - floating-point FDCT (64-bit SSE) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. 
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+        align 16
+        global EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+        push rbp
+        mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
+        sub rsp, byte 4
+        and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+        mov [rsp],rax ; keep the pre-alignment rsp at the aligned frame base
+        mov rbp,rsp ; rbp = aligned rbp
+        lea rsp, [wk(0)] ; rsp = &wk[0], i.e. WK_NUM xmmwords below rbp
+        collect_args
+
+        ; ---- Pass 1: process rows.
+
+        mov rdx, r10 ; (FAST_FLOAT *)
+        mov rcx, DCTSIZE/4 ; loop count; rdx advances by 4 rows per iteration
+.rowloop:
+
+        movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+        movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
+        unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
+        movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
+        unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
+
+        movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+        movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+        movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+        movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+        unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
+        movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
+        unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
+
+        movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
+        unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
+        movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
+        unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
+
+        movaps xmm0,xmm7 ; xmm0=data1 (copy kept for the sum)
+        movaps xmm5,xmm6 ; xmm5=data0 (copy kept for the sum)
+        subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+        subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+        addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+        addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+        movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+        movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+        movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+        movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+        movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
+        unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
+        movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
+        unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
+
+        movaps xmm2,xmm7 ; xmm2=data3 (copy kept for the difference)
+        movaps xmm3,xmm4 ; xmm3=data2 (copy kept for the difference)
+        addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+        addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+        subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+        subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps xmm1,xmm5
+        movaps xmm6,xmm0
+        subps xmm5,xmm7 ; xmm5=tmp13
+        subps xmm0,xmm4 ; xmm0=tmp12
+        addps xmm1,xmm7 ; xmm1=tmp10
+        addps xmm6,xmm4 ; xmm6=tmp11
+
+        addps xmm0,xmm5 ; xmm0=tmp12+tmp13
+        mulps xmm0,[rel PD_0_707] ; xmm0=z1
+
+        movaps xmm7,xmm1
+        movaps xmm4,xmm5
+        subps xmm1,xmm6 ; xmm1=data4
+        subps xmm5,xmm0 ; xmm5=data6
+        addps xmm7,xmm6 ; xmm7=data0
+        addps xmm4,xmm0 ; xmm4=data2
+
+        movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+        movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        ; -- Odd part
+
+        movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+        movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+        addps xmm2,xmm3 ; xmm2=tmp10
+        addps xmm3,xmm6 ; xmm3=tmp11
+        addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+        mulps xmm3,[rel PD_0_707] ; xmm3=z3
+
+        movaps xmm1,xmm2 ; xmm1=tmp10
+        subps xmm2,xmm6 ; xmm2=tmp10-tmp12
+        mulps xmm2,[rel PD_0_382] ; xmm2=z5
+        mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps xmm1,xmm2 ; xmm1=z2
+        addps xmm6,xmm2 ; xmm6=z4
+
+        movaps xmm5,xmm0
+        subps xmm0,xmm3 ; xmm0=z13
+        addps xmm5,xmm3 ; xmm5=z11
+
+        movaps xmm7,xmm0
+        movaps xmm4,xmm5
+        subps xmm0,xmm1 ; xmm0=data3
+        subps xmm5,xmm6 ; xmm5=data7
+        addps xmm7,xmm1 ; xmm7=data5
+        addps xmm4,xmm6 ; xmm4=data1
+
+        movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+        movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; step past the 4 rows just processed
+        dec rcx
+        jnz near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov rdx, r10 ; (FAST_FLOAT *)
+        mov rcx, DCTSIZE/4 ; loop count; rdx advances by 4 columns per iteration
+.columnloop:
+
+        movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+        movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
+        unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
+        movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
+        unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
+
+        movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+        movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+        movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+        movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
+        unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
+        movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
+        unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
+
+        movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
+        unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
+        movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
+        unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
+
+        movaps xmm0,xmm7 ; xmm0=data1 (copy kept for the sum)
+        movaps xmm5,xmm6 ; xmm5=data0 (copy kept for the sum)
+        subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+        subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+        addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+        addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+        movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+        movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+        movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+        movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+        movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
+        unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
+        movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
+        unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
+
+        movaps xmm2,xmm7 ; xmm2=data3 (copy kept for the difference)
+        movaps xmm3,xmm4 ; xmm3=data2 (copy kept for the difference)
+        addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+        addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+        subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+        subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps xmm1,xmm5
+        movaps xmm6,xmm0
+        subps xmm5,xmm7 ; xmm5=tmp13
+        subps xmm0,xmm4 ; xmm0=tmp12
+        addps xmm1,xmm7 ; xmm1=tmp10
+        addps xmm6,xmm4 ; xmm6=tmp11
+
+        addps xmm0,xmm5 ; xmm0=tmp12+tmp13
+        mulps xmm0,[rel PD_0_707] ; xmm0=z1
+
+        movaps xmm7,xmm1
+        movaps xmm4,xmm5
+        subps xmm1,xmm6 ; xmm1=data4
+        subps xmm5,xmm0 ; xmm5=data6
+        addps xmm7,xmm6 ; xmm7=data0
+        addps xmm4,xmm0 ; xmm4=data2
+
+        movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+        movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        ; -- Odd part
+
+        movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+        movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+        addps xmm2,xmm3 ; xmm2=tmp10
+        addps xmm3,xmm6 ; xmm3=tmp11
+        addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+        mulps xmm3,[rel PD_0_707] ; xmm3=z3
+
+        movaps xmm1,xmm2 ; xmm1=tmp10
+        subps xmm2,xmm6 ; xmm2=tmp10-tmp12
+        mulps xmm2,[rel PD_0_382] ; xmm2=z5
+        mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps xmm1,xmm2 ; xmm1=z2
+        addps xmm6,xmm2 ; xmm6=z4
+
+        movaps xmm5,xmm0
+        subps xmm0,xmm3 ; xmm0=z13
+        addps xmm5,xmm3 ; xmm5=z11
+
+        movaps xmm7,xmm0
+        movaps xmm4,xmm5
+        subps xmm0,xmm1 ; xmm0=data3
+        subps xmm5,xmm6 ; xmm5=data7
+        addps xmm7,xmm1 ; xmm7=data5
+        addps xmm4,xmm6 ; xmm4=data1
+
+        movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+        movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        add rdx, byte 4*SIZEOF_FAST_FLOAT ; step past the 4 columns just processed
+        dec rcx
+        jnz near .columnloop
+
+        uncollect_args
+        mov rsp,rbp ; rsp <- aligned rbp
+        pop rsp ; rsp <- pre-alignment rsp stored at [rbp] (points at the saved rbp)
+        pop rbp ; restore the caller's rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align 16
diff --git a/Builder/jni-1.11/simd/jfdctflt-sse.asm b/Builder/jni-1.11/simd/jfdctflt-sse.asm
new file mode 100644
index 000000000..e7ede26c0
--- /dev/null
+++ b/Builder/jni-1.11/simd/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+        shufps %1,%2,0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+        shufps %1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz 16
+        global EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+        alignz 16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT *data)
+;
+
+%define data(b) (b)+8 ; FAST_FLOAT *data
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+        align 16
+        global EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+        push ebp
+        mov eax,esp ; eax = esp after the push (points at the saved ebp)
+        sub esp, byte 4
+        and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+        mov [esp],eax ; keep the pre-alignment esp at the aligned frame base
+        mov ebp,esp ; ebp = aligned ebp
+        lea esp, [wk(0)] ; esp = &wk[0], i.e. WK_NUM xmmwords below ebp
+        pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+        get_GOT ebx ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov edx, POINTER [data(eax)] ; edx = data, read from [eax+8] (FAST_FLOAT *)
+        mov ecx, DCTSIZE/4 ; loop count; edx advances by 4 rows per iteration
+        alignx 16,7
+.rowloop:
+
+        movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+        movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
+        unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
+        movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
+        unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
+
+        movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+        movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+        movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+        movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+        unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
+        movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
+        unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
+
+        movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
+        unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
+        movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
+        unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
+
+        movaps xmm0,xmm7 ; xmm0=data1 (copy kept for the sum)
+        movaps xmm5,xmm6 ; xmm5=data0 (copy kept for the sum)
+        subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+        subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+        addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+        addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+        movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+        movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+        movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+        movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+        movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
+        unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
+        movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
+        unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
+
+        movaps xmm2,xmm7 ; xmm2=data3 (copy kept for the difference)
+        movaps xmm3,xmm4 ; xmm3=data2 (copy kept for the difference)
+        addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+        addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+        subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+        subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps xmm1,xmm5
+        movaps xmm6,xmm0
+        subps xmm5,xmm7 ; xmm5=tmp13
+        subps xmm0,xmm4 ; xmm0=tmp12
+        addps xmm1,xmm7 ; xmm1=tmp10
+        addps xmm6,xmm4 ; xmm6=tmp11
+
+        addps xmm0,xmm5 ; xmm0=tmp12+tmp13
+        mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+        movaps xmm7,xmm1
+        movaps xmm4,xmm5
+        subps xmm1,xmm6 ; xmm1=data4
+        subps xmm5,xmm0 ; xmm5=data6
+        addps xmm7,xmm6 ; xmm7=data0
+        addps xmm4,xmm0 ; xmm4=data2
+
+        movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+        movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+        ; -- Odd part
+
+        movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+        movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+        addps xmm2,xmm3 ; xmm2=tmp10
+        addps xmm3,xmm6 ; xmm3=tmp11
+        addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+        mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+        movaps xmm1,xmm2 ; xmm1=tmp10
+        subps xmm2,xmm6 ; xmm2=tmp10-tmp12
+        mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+        mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps xmm1,xmm2 ; xmm1=z2
+        addps xmm6,xmm2 ; xmm6=z4
+
+        movaps xmm5,xmm0
+        subps xmm0,xmm3 ; xmm0=z13
+        addps xmm5,xmm3 ; xmm5=z11
+
+        movaps xmm7,xmm0
+        movaps xmm4,xmm5
+        subps xmm0,xmm1 ; xmm0=data3
+        subps xmm5,xmm6 ; xmm5=data7
+        addps xmm7,xmm1 ; xmm7=data5
+        addps xmm4,xmm6 ; xmm4=data1
+
+        movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+        movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+        add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; step past the 4 rows just processed
+        dec ecx
+        jnz near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov edx, POINTER [data(eax)] ; edx = data, read from [eax+8] (FAST_FLOAT *)
+        mov ecx, DCTSIZE/4 ; loop count; edx advances by 4 columns per iteration
+        alignx 16,7
+.columnloop:
+
+        movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+        movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
+        unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
+        movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
+        unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
+
+        movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+        movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+        movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+        movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
+        unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
+        movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
+        unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
+
+        movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
+        unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
+        movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
+        unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
+
+        movaps xmm0,xmm7 ; xmm0=data1 (copy kept for the sum)
+        movaps xmm5,xmm6 ; xmm5=data0 (copy kept for the sum)
+        subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+        subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+        addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+        addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+        movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+        movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+        movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+        movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+        movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
+        unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
+        movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
+        unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
+
+        movaps xmm2,xmm7 ; xmm2=data3 (copy kept for the difference)
+        movaps xmm3,xmm4 ; xmm3=data2 (copy kept for the difference)
+        addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+        addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+        subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+        subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps xmm1,xmm5
+        movaps xmm6,xmm0
+        subps xmm5,xmm7 ; xmm5=tmp13
+        subps xmm0,xmm4 ; xmm0=tmp12
+        addps xmm1,xmm7 ; xmm1=tmp10
+        addps xmm6,xmm4 ; xmm6=tmp11
+
+        addps xmm0,xmm5 ; xmm0=tmp12+tmp13
+        mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+        movaps xmm7,xmm1
+        movaps xmm4,xmm5
+        subps xmm1,xmm6 ; xmm1=data4
+        subps xmm5,xmm0 ; xmm5=data6
+        addps xmm7,xmm6 ; xmm7=data0
+        addps xmm4,xmm0 ; xmm4=data2
+
+        movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+        movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+        ; -- Odd part
+
+        movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+        movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+        addps xmm2,xmm3 ; xmm2=tmp10
+        addps xmm3,xmm6 ; xmm3=tmp11
+        addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+        mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+        movaps xmm1,xmm2 ; xmm1=tmp10
+        subps xmm2,xmm6 ; xmm2=tmp10-tmp12
+        mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+        mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps xmm1,xmm2 ; xmm1=z2
+        addps xmm6,xmm2 ; xmm6=z4
+
+        movaps xmm5,xmm0
+        subps xmm0,xmm3 ; xmm0=z13
+        addps xmm5,xmm3 ; xmm5=z11
+
+        movaps xmm7,xmm0
+        movaps xmm4,xmm5
+        subps xmm0,xmm1 ; xmm0=data3
+        subps xmm5,xmm6 ; xmm5=data7
+        addps xmm7,xmm1 ; xmm7=data5
+        addps xmm4,xmm6 ; xmm4=data1
+
+        movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+        movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+        add edx, byte 4*SIZEOF_FAST_FLOAT ; step past the 4 columns just processed
+        dec ecx
+        jnz near .columnloop
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+        poppic ebx
+        mov esp,ebp ; esp <- aligned ebp
+        pop esp ; esp <- pre-alignment esp stored at [ebp] (points at the saved ebp)
+        pop ebp ; restore the caller's ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align 16
diff --git a/Builder/jni-1.11/simd/jfdctfst-altivec.c b/Builder/jni-1.11/simd/jfdctfst-altivec.c
new file mode 100644
index 000000000..04157f77e
--- /dev/null
+++ b/Builder/jni-1.11/simd/jfdctfst-altivec.c
@@ -0,0 +1,156 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT). This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
+
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) /* = 5 with the values above */
+
+
+#define DO_FDCT() \
+{ \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0 = vec_add(tmp10, tmp11); \
+  out4 = vec_sub(tmp10, tmp11); \
+  \
+  z1 = vec_add(tmp12, tmp13); \
+  z1 = vec_sl(z1, pre_multiply_scale_bits); \
+  z1 = vec_madds(z1, pw_0707, pw_zero); \
+  \
+  out2 = vec_add(tmp13, z1); \
+  out6 = vec_sub(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = vec_add(tmp4, tmp5); \
+  tmp11 = vec_add(tmp5, tmp6); \
+  tmp12 = vec_add(tmp6, tmp7); \
+  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  z5 = vec_sub(tmp10, tmp12); \
+  z5 = vec_madds(z5, pw_0382, pw_zero); \
+  \
+  z2 = vec_madds(tmp10, pw_0541, z5); \
+  z4 = vec_madds(tmp12, pw_1306, z5); \
+  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
+  \
+  z11 = vec_add(tmp7, z3); \
+  z13 = vec_sub(tmp7, z3); \
+  \
+  out5 = vec_add(z13, z2); \
+  out3 = vec_sub(z13, z2); \
+  out1 = vec_add(z11, z4); \
+  out7 = vec_sub(z11, z4); \
+}
+
+
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+  /* Pass 1: process rows */
+
+  row0 = vec_ld(0, data); /* eight loads at offsets 0, 16, ..., 112 from data */
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col); /* not defined in this file; presumably from jsimd_altivec.h -- TODO confirm */
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT(); /* fills out0..out7 from tmp0..tmp7 */
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row); /* pass-1 outputs become the pass-2 inputs row0..row7 */
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT();
+
+  vec_st(out0, 0, data); /* results are stored back over the input block */
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/Builder/jni-1.11/simd/jfdctfst-mmx.asm b/Builder/jni-1.11/simd/jfdctfst-mmx.asm
new file mode 100644
index 000000000..eb2eb9c50
--- /dev/null
+++ b/Builder/jni-1.11/simd/jfdctfst-mmx.asm
@@ -0,0 +1,396 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+        alignz 16
+        global EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
+
+        alignz 16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx (DCTELEM *data)
+;
+
+%define data(b) (b)+8 ; DCTELEM *data
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+        align 16
+        global EXTN(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+        push ebp
+        mov eax,esp ; eax = esp after the push (points at the saved ebp)
+        sub esp, byte 4
+        and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+        mov [esp],eax ; keep the pre-alignment esp at the aligned frame base
+        mov ebp,esp ; ebp = aligned ebp
+        lea esp, [wk(0)] ; esp = &wk[0], i.e. WK_NUM mmwords below ebp
+        pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+        get_GOT ebx ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov edx, POINTER [data(eax)] ; edx = data, read from [eax+8] (DCTELEM *)
+        mov ecx, DCTSIZE/4 ; loop count; edx advances by 4 rows per iteration
+        alignx 16,7
+.rowloop:
+
+        movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+        movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+        movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+        movq mm4,mm0 ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
+        punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
+        movq mm5,mm2 ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
+        punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
+
+        movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+        movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+        movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+        movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+        movq mm4,mm6 ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
+        punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
+        movq mm2,mm1 ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
+        punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
+
+        movq mm7,mm6 ; transpose coefficients(phase 2)
+        punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
+        punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
+        movq mm3,mm2 ; transpose coefficients(phase 2)
+        punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
+        punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
+
+        movq mm0,mm7 ; mm0=data1 (copy kept for the sum)
+        movq mm5,mm6 ; mm5=data0 (copy kept for the sum)
+        psubw mm7,mm2 ; mm7=data1-data6=tmp6
+        psubw mm6,mm3 ; mm6=data0-data7=tmp7
+        paddw mm0,mm2 ; mm0=data1+data6=tmp1
+        paddw mm5,mm3 ; mm5=data0+data7=tmp0
+
+        movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+        movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+        movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+        movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+        movq mm7,mm4 ; transpose coefficients(phase 2)
+        punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
+        punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
+        movq mm6,mm1 ; transpose coefficients(phase 2)
+        punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
+        punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
+
+        movq mm2,mm7 ; mm2=data3 (copy kept for the difference)
+        movq mm3,mm4 ; mm3=data2 (copy kept for the difference)
+        paddw mm7,mm1 ; mm7=data3+data4=tmp3
+        paddw mm4,mm6 ; mm4=data2+data5=tmp2
+        psubw mm2,mm1 ; mm2=data3-data4=tmp4
+        psubw mm3,mm6 ; mm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq mm1,mm5
+        movq mm6,mm0
+        psubw mm5,mm7 ; mm5=tmp13
+        psubw mm0,mm4 ; mm0=tmp12
+        paddw mm1,mm7 ; mm1=tmp10
+        paddw mm6,mm4 ; mm6=tmp11
+
+        paddw mm0,mm5 ; mm0=tmp12+tmp13
+        psllw mm0,PRE_MULTIPLY_SCALE_BITS
+        pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+        movq mm7,mm1
+        movq mm4,mm5
+        psubw mm1,mm6 ; mm1=data4
+        psubw mm5,mm0 ; mm5=data6
+        paddw mm7,mm6 ; mm7=data0
+        paddw mm4,mm0 ; mm4=data2
+
+        movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+        movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+        movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+        movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+        ; -- Odd part
+
+        movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+        movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+        paddw mm2,mm3 ; mm2=tmp10
+        paddw mm3,mm6 ; mm3=tmp11
+        paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
+
+        psllw mm2,PRE_MULTIPLY_SCALE_BITS
+        psllw mm6,PRE_MULTIPLY_SCALE_BITS
+
+        psllw mm3,PRE_MULTIPLY_SCALE_BITS
+        pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+        movq mm1,mm2 ; mm1=tmp10
+        psubw mm2,mm6 ; mm2=tmp10-tmp12
+        pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+        pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+        pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+        paddw mm1,mm2 ; mm1=z2
+        paddw mm6,mm2 ; mm6=z4
+
+        movq mm5,mm0
+        psubw mm0,mm3 ; mm0=z13
+        paddw mm5,mm3 ; mm5=z11
+
+        movq mm7,mm0
+        movq mm4,mm5
+        psubw mm0,mm1 ; mm0=data3
+        psubw mm5,mm6 ; mm5=data7
+        paddw mm7,mm1 ; mm7=data5
+        paddw mm4,mm6 ; mm4=data1
+
+        movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+        movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+        movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+        movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+        add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM ; step past the 4 rows just processed
+        dec ecx
+        jnz near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov edx, POINTER [data(eax)] ; edx = data, read from [eax+8] (DCTELEM *)
+        mov ecx, DCTSIZE/4 ; loop count; edx advances by 4 columns per iteration
+        alignx 16,7
+.columnloop:
+
+        movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+        movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+        movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+        movq mm4,mm0 ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
+        punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
+        movq mm5,mm2 ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
+        punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
+
+        movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+        movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+        movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+        movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+        movq mm4,mm6 ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
+        punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
+        movq mm2,mm1 ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
+        punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
+
+        movq mm7,mm6 ; transpose coefficients(phase 2)
+        punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
+        punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
+        movq mm3,mm2 ; transpose coefficients(phase 2)
+        punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
+        punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
+
+        movq mm0,mm7 ; mm0=data1 (copy kept for the sum)
+        movq mm5,mm6 ; mm5=data0 (copy kept for the sum)
+        psubw mm7,mm2 ; mm7=data1-data6=tmp6
+        psubw mm6,mm3 ; mm6=data0-data7=tmp7
+        paddw mm0,mm2 ; mm0=data1+data6=tmp1
+        paddw mm5,mm3 ; mm5=data0+data7=tmp0
+
+        movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+        movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+        movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+        movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+        movq mm7,mm4 ; transpose coefficients(phase 2)
+        punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
+        punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
+        movq mm6,mm1 ; transpose coefficients(phase 2)
+        punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
+        punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
+
+        movq mm2,mm7 ; mm2=data3 (copy kept for the difference)
+        movq mm3,mm4 ; mm3=data2 (copy kept for the difference)
+        paddw mm7,mm1 ; mm7=data3+data4=tmp3
+        paddw mm4,mm6 ; mm4=data2+data5=tmp2
+        psubw mm2,mm1 ; mm2=data3-data4=tmp4
+        psubw mm3,mm6 ; mm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq mm1,mm5
+        movq mm6,mm0
+        psubw mm5,mm7 ; mm5=tmp13
+        psubw mm0,mm4 ; mm0=tmp12
+        paddw mm1,mm7 ; mm1=tmp10
+        paddw mm6,mm4 ; mm6=tmp11
+
+        paddw mm0,mm5 ; mm0=tmp12+tmp13
+        psllw mm0,PRE_MULTIPLY_SCALE_BITS
+        pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+        movq mm7,mm1
+        movq mm4,mm5
+        psubw mm1,mm6 ; mm1=data4
+        psubw mm5,mm0 ; mm5=data6
+        paddw mm7,mm6 ; mm7=data0
+        paddw mm4,mm0 ; mm4=data2
+
+        movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+        movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+        movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+        movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+        ; -- Odd part
+
+        movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+        movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+        paddw mm2,mm3 ; mm2=tmp10
+        paddw mm3,mm6 ; mm3=tmp11
+        paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
+
+        psllw mm2,PRE_MULTIPLY_SCALE_BITS
+        psllw mm6,PRE_MULTIPLY_SCALE_BITS
+
+        psllw mm3,PRE_MULTIPLY_SCALE_BITS
+        pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+        movq mm1,mm2 ; mm1=tmp10
+        psubw mm2,mm6 ; mm2=tmp10-tmp12
+        pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+        pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+        pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+        paddw mm1,mm2 ; mm1=z2
+        paddw mm6,mm2 ; mm6=z4
+
+        movq mm5,mm0
+        psubw mm0,mm3 ; mm0=z13
+        paddw mm5,mm3 ; mm5=z11
+
+        movq mm7,mm0
+        movq mm4,mm5
+        psubw mm0,mm1 ; mm0=data3
+        psubw mm5,mm6 ; mm5=data7
+        paddw mm7,mm1 ; mm7=data5
+        paddw mm4,mm6 ; mm4=data1
+
+        movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+        movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+        movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+        movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+        add edx, byte 4*SIZEOF_DCTELEM ; step past the 4 columns just processed
+        dec ecx
+        jnz near .columnloop
+
+        emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+        poppic ebx
+        mov esp,ebp ; esp <- aligned ebp
+        pop esp ; esp <- pre-alignment esp stored at [ebp] (points at the saved ebp)
+        pop ebp ; restore the caller's ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align 16
diff --git a/Builder/jni-1.11/simd/jfdctfst-sse2-64.asm b/Builder/jni-1.11/simd/jfdctfst-sse2-64.asm
new file mode 100644
index 000000000..4c9668542
--- /dev/null
+++ b/Builder/jni-1.11/simd/jfdctfst-sse2-64.asm
@@ -0,0 +1,391 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. 
+; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2 (DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67) + ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) +
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq 
xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw xmm0,xmm7 ; xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 + pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 
51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 + pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jfdctfst-sse2.asm b/Builder/jni-1.11/simd/jfdctfst-sse2.asm new file mode 100644 index 000000000..54856a236 --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctfst-sse2.asm @@ -0,0 +1,403 @@ +; +; jfdctfst.asm - fast integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2 (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67) + ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)]
; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw 
xmm0,xmm7 ; xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 
2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD 
[XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jfdctint-altivec.c b/Builder/jni-1.11/simd/jfdctint-altivec.c new file mode 100644 index 000000000..e6e8a5687 --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctint-altivec.c @@ -0,0 +1,262 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
+ * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* SLOW INTEGER FORWARD DCT */ + +#include "jsimd_altivec.h" + + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + + +#define DO_FDCT_COMMON(PASS) \ +{ \ + /* (Original) \ + * z1 = (tmp12 + tmp13) * 0.541196100; \ + * data2 = z1 + tmp13 * 0.765366865; \ + * data6 = z1 + tmp12 * -1.847759065; \ + * \ + * (This implementation) \ + * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ + * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ + */ \ + \ 
+ tmp1312l = vec_mergeh(tmp13, tmp12); \ + tmp1312h = vec_mergel(tmp13, tmp12); \ + \ + out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \ + out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \ + out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \ + out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \ + out2h = vec_sra(out2h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out6 = vec_pack(out6l, out6h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(tmp4, tmp6); \ + z4 = vec_add(tmp5, tmp7); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \ + \ + /* (Original) \ + * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ + * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ + * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ + * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ + * \ + * (This implementation) \ + * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ + * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ + * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ + * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ + * data7 = 
tmp4 + z3; data5 = tmp5 + z4; \ + * data3 = tmp6 + z3; data1 = tmp7 + z4; \ + */ \ + \ + tmp47l = vec_mergeh(tmp4, tmp7); \ + tmp47h = vec_mergel(tmp4, tmp7); \ + \ + out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \ + out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \ + out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \ + out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \ + \ + out7l = vec_sra(out7l, descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + \ + out7 = vec_pack(out7l, out7h); \ + out1 = vec_pack(out1l, out1h); \ + \ + tmp56l = vec_mergeh(tmp5, tmp6); \ + tmp56h = vec_mergel(tmp5, tmp6); \ + \ + out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \ + out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \ + out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \ + out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \ + \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + \ + out5 = vec_pack(out5l, out5h); \ + out3 = vec_pack(out3l, out3h); \ +} + +#define DO_FDCT_PASS1() \ +{ \ + /* Even part (pass 1): only out0 and out4 are formed directly here, each shifted left by pass1_bits; tmp10..tmp13 stay live for the DO_FDCT_COMMON(1) call below */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_sl(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_sl(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(1); \ +} + +#define DO_FDCT_PASS2() \ +{ \ + /* Even part (pass 2): only out0 and out4 are formed directly here; pw_descale_p2x is added as the rounding term before the right shift (vec_sra) by pass1_bits; tmp10..tmp13 stay live for the DO_FDCT_COMMON(2) call below */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_add(out0, pw_descale_p2x); \ + out0 = vec_sra(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_add(out4, pw_descale_p2x); \ + out4 = vec_sra(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(2); \ +} + +
+void +jsimd_fdct_islow_altivec (DCTELEM *data) /* in-place forward DCT: the block is read from data with vec_ld and the results are written back to the same offsets with vec_st */ +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, + z3, z4, z34l, z34h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int z3l, z3h, z4l, z4h, + out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, + out7l, out7h; + + /* Constants: pw_f... and pw_mf... multiplier pairs consumed by vec_msums, pw_descale_p2x and pd_descale_p1/p2 rounding terms, and the shift counts pass1_bits and descale_p1/p2 */ + __vector short + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }, + pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }; + + /* Pass 1: process rows. Eight vectors are loaded from data at byte offsets 0..112 in steps of 16 and handed to TRANSPOSE (defined outside this view; presumably an 8x8 transpose of row0..7 into col0..7 - confirm) before the sums and differences feed DO_FDCT_PASS1 */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT_PASS1(); + + /* Pass 2: process columns. The pass-1 outputs out0..7 go through TRANSPOSE into row0..7, the same sums and differences feed DO_FDCT_PASS2, and the final out0..7 are stored back over data */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1,
row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT_PASS2(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git a/Builder/jni-1.11/simd/jfdctint-mmx.asm b/Builder/jni-1.11/simd/jfdctint-mmx.asm new file mode 100644 index 000000000..9142ad881 --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctint-mmx.asm @@ -0,0 +1,621 @@ +; +; jfdctint.asm - accurate integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details.
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_mmx) + +EXTN(jconst_fdct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 2 dw
(F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_mmx (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 ; two mmword scratch slots, wk(0) and wk(1), placed just below the aligned ebp + + align 16 + global EXTN(jsimd_fdct_islow_mmx) + +EXTN(jsimd_fdct_islow_mmx): + push ebp + mov eax,esp ; eax = original ebp (the value of esp right after the push above) + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax ; kept at [ebp] so the epilogue can restore it with 'pop esp' + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows (.rowloop runs DCTSIZE/4 times; edx advances by 4 rows per iteration).
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(24 34 25 35) + punpckhwd mm5,mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm4,mm7 ; mm4=(02 12 03 13) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(04 14 05 15) + punpckhwd mm2,mm3 ; mm2=(06 16 07 17) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose
coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + paddw mm5,mm7 ; mm5=tmp10 + paddw mm0,mm4 ; mm0=tmp11 + psubw mm1,mm7 ; mm1=tmp13 + psubw mm6,mm4 ; mm6=tmp12 + + movq mm7,mm5 + paddw mm5,mm0 ; mm5=tmp10+tmp11 + psubw mm7,mm0 ; mm7=tmp10-tmp11 + + psllw mm5,PASS1_BITS ; mm5=data0 + psllw mm7,PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4,mm1 ; mm1=tmp13 + movq mm0,mm1 + punpcklwd mm4,mm6 ; mm6=tmp12 + punpckhwd mm0,mm6 + movq mm1,mm4 + movq mm6,mm0 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4,DESCALE_P1 + psrad mm0,DESCALE_P1 + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm4,mm0 ; mm4=data2 + packssdw mm1,mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ;
mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0,mm2 ; mm2=tmp4 + movq mm6,mm3 ; mm3=tmp5 + paddw mm0,mm5 ; mm0=z3 + paddw mm6,mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4,mm0 + movq mm1,mm0 + punpcklwd mm4,mm6 + punpckhwd mm1,mm6 + movq mm0,mm4 + movq mm6,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4,mm2 + movq mm1,mm2 + punpcklwd mm4,mm7 + punpckhwd mm1,mm7 + movq mm2,mm4 + movq mm7,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2,mm0 ; mm2=data1L + paddd mm7,mm6 ;
mm7=data1H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4,DESCALE_P1 + psrad mm1,DESCALE_P1 + paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm2,DESCALE_P1 + psrad mm7,DESCALE_P1 + + packssdw mm4,mm1 ; mm4=data7 + packssdw mm2,mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1,mm3 + movq mm7,mm3 + punpcklwd mm1,mm5 + punpckhwd mm7,mm5 + movq mm3,mm1 + movq mm5,mm7 + pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1,mm0 ; mm1=data5L + paddd mm7,mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1,DESCALE_P1 + psrad mm7,DESCALE_P1 + paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm3,DESCALE_P1 + psrad mm5,DESCALE_P1 + + packssdw mm1,mm7 ; mm1=data5 + packssdw mm3,mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns (.columnloop runs DCTSIZE/4 times; edx advances by 4 columns per iteration).
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(02 03 12 13) + punpckhwd mm4,mm1 ; mm4=(22 23 32 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(42 43 52 53) + punpckhwd mm5,mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 01 10 11) + punpckhwd mm4,mm7 ; mm4=(20 21 30 31) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(40 41 50 51) + punpckhwd mm2,mm3 ; mm2=(60 61 70 71) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose
coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + paddw mm5,mm7 ; mm5=tmp10 + paddw mm0,mm4 ; mm0=tmp11 + psubw mm1,mm7 ; mm1=tmp13 + psubw mm6,mm4 ; mm6=tmp12 + + movq mm7,mm5 + paddw mm5,mm0 ; mm5=tmp10+tmp11 + psubw mm7,mm0 ; mm7=tmp10-tmp11 + + paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw mm5,PASS1_BITS ; mm5=data0 + psraw mm7,PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4,mm1 ; mm1=tmp13 + movq mm0,mm1 + punpcklwd mm4,mm6 ; mm6=tmp12 + punpckhwd mm0,mm6 + movq mm1,mm4 + movq mm6,mm0 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4,DESCALE_P2 + psrad mm0,DESCALE_P2 + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1,DESCALE_P2 + psrad mm6,DESCALE_P2 + + packssdw mm4,mm0 ; mm4=data2 + packssdw mm1,mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD
[MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0,mm2 ; mm2=tmp4 + movq mm6,mm3 ; mm3=tmp5 + paddw mm0,mm5 ; mm0=z3 + paddw mm6,mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4,mm0 + movq mm1,mm0 + punpcklwd mm4,mm6 + punpckhwd mm1,mm6 + movq mm0,mm4 + movq mm6,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4,mm2 + movq mm1,mm2 + punpcklwd mm4,mm7 + punpckhwd mm1,mm7 + movq mm2,mm4 + movq mm7,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd
mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2,mm0 ; mm2=data1L + paddd mm7,mm6 ; mm7=data1H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4,DESCALE_P2 + psrad mm1,DESCALE_P2 + paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm2,DESCALE_P2 + psrad mm7,DESCALE_P2 + + packssdw mm4,mm1 ; mm4=data7 + packssdw mm2,mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1,mm3 + movq mm7,mm3 + punpcklwd mm1,mm5 + punpckhwd mm7,mm5 + movq mm3,mm1 + movq mm5,mm7 + pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1,mm0 ; mm1=data5L + paddd mm7,mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1,DESCALE_P2 + psrad mm7,DESCALE_P2 + paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm3,DESCALE_P2 + psrad mm5,DESCALE_P2 + + packssdw mm1,mm7 ; mm1=data5 + packssdw mm3,mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this.
+ align 16 diff --git a/Builder/jni-1.11/simd/jfdctint-sse2-64.asm b/Builder/jni-1.11/simd/jfdctint-sse2-64.asm new file mode 100644 index 000000000..9a0ca0fd2 --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctint-sse2-64.asm @@ -0,0 +1,621 @@ +; +; jfdctint.asm - accurate integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. 
+; +; GLOBAL(void) +; jsimd_fdct_islow_sse2 (DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 16 + global EXTN(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67) + ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) +
punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq 
xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw xmm0,xmm7 ; xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L + pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H + pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L + pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H + + paddd xmm7,[rel PD_DESCALE_P1] + paddd xmm6,[rel PD_DESCALE_P1] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[rel PD_DESCALE_P1] + paddd xmm0,[rel PD_DESCALE_P1] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; 
xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 + movdqa xmm0,xmm4 + pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L + pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H + pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L + pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L + pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H + pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L + pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[rel PD_DESCALE_P1] + paddd xmm4,[rel PD_DESCALE_P1] + psrad xmm7,DESCALE_P1 + psrad 
xmm4,DESCALE_P1 + paddd xmm2,[rel PD_DESCALE_P1] + paddd xmm1,[rel PD_DESCALE_P1] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L + pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H + pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L + pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[rel PD_DESCALE_P1] + paddd xmm1,[rel PD_DESCALE_P1] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[rel PD_DESCALE_P1] + paddd xmm3,[rel PD_DESCALE_P1] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. 
+ + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa 
xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[rel PW_DESCALE_P2X] + paddw xmm5,[rel PW_DESCALE_P2X] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD 
[XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L + pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L + pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H + + paddd xmm4,[rel PD_DESCALE_P2] + paddd xmm2,[rel PD_DESCALE_P2] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[rel PD_DESCALE_P2] + paddd xmm6,[rel PD_DESCALE_P2] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L + pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H + pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L + pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; 
wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L + pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H + pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L + pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[rel PD_DESCALE_P2] + paddd xmm1,[rel PD_DESCALE_P2] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[rel PD_DESCALE_P2] + paddd xmm5,[rel PD_DESCALE_P2] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L + pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H + pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L + pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H 
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[rel PD_DESCALE_P2] + paddd xmm5,[rel PD_DESCALE_P2] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[rel PD_DESCALE_P2] + paddd xmm7,[rel PD_DESCALE_P2] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jfdctint-sse2.asm b/Builder/jni-1.11/simd/jfdctint-sse2.asm new file mode 100644 index 000000000..db9d0bbe4 --- /dev/null +++ b/Builder/jni-1.11/simd/jfdctint-sse2.asm @@ -0,0 +1,633 @@ +; +; jfdctint.asm - accurate integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw 
(F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_sse2 (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 16 + global EXTN(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67) + ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] 
; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw 
xmm0,xmm7 ; xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 
+ movdqa xmm0,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + 
pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; 
xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 
42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part 
+ + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, 
XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jidctflt-3dn.asm b/Builder/jni-1.11/simd/jidctflt-3dn.asm new file mode 100644 index 000000000..99356f20a --- /dev/null +++ b/Builder/jni-1.11/simd/jidctflt-3dn.asm @@ -0,0 +1,451 @@ +; +; jidctflt.asm - floating-point IDCT (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_3dnow) + +EXTN(jconst_idct_float_3dnow): + +PD_1_414 times 2 dd 1.414213562373095048801689 +PD_1_847 times 2 dd 1.847759065022573512256366 +PD_1_082 times 2 dd 1.082392200292393968799446 +PD_2_613 times 2 dd 2.613125929752753055713286 +PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_3dnow) + +EXTN(jsimd_idct_float_3dnow): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/2 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + pushpic ebx ; save GOT address + mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + or eax,ebx + poppic ebx ; restore GOT address + jnz short .columnDCT + + ; -- AC terms all zero + + movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0,mm0 + psrad mm0,(DWORD_BIT-WORD_BIT) + pi2fd mm0,mm0 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm1,mm0 + punpckldq mm0,mm0 + punpckhdq mm1,mm1 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0,mm0 + punpcklwd mm1,mm1 + psrad mm0,(DWORD_BIT-WORD_BIT) + psrad mm1,(DWORD_BIT-WORD_BIT) + pi2fd mm0,mm0 + pi2fd mm1,mm1 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + 
punpcklwd mm2,mm2 + punpcklwd mm3,mm3 + psrad mm2,(DWORD_BIT-WORD_BIT) + psrad mm3,(DWORD_BIT-WORD_BIT) + pi2fd mm2,mm2 + pi2fd mm3,mm3 + + pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm1 + pfsub mm0,mm2 ; mm0=tmp11 + pfsub mm1,mm3 + pfadd mm4,mm2 ; mm4=tmp10 + pfadd mm5,mm3 ; mm5=tmp13 + + pfmul mm1,[GOTOFF(ebx,PD_1_414)] + pfsub mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm5 ; mm4=tmp3 + pfsub mm0,mm1 ; mm0=tmp2 + pfadd mm6,mm5 ; mm6=tmp0 + pfadd mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm2,mm2 + punpcklwd mm3,mm3 + psrad mm2,(DWORD_BIT-WORD_BIT) + psrad mm3,(DWORD_BIT-WORD_BIT) + pi2fd mm2,mm2 + pi2fd mm3,mm3 + + pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm5,mm5 + punpcklwd mm1,mm1 + psrad mm5,(DWORD_BIT-WORD_BIT) + psrad mm1,(DWORD_BIT-WORD_BIT) + pi2fd mm5,mm5 + pi2fd mm1,mm1 + + pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4,mm2 + movq mm0,mm5 + pfadd mm2,mm1 ; mm2=z11 + pfadd mm5,mm3 ; mm5=z13 + pfsub mm4,mm1 ; mm4=z12 + pfsub mm0,mm3 ; mm0=z10 + + movq mm1,mm2 + pfsub mm2,mm5 + pfadd mm1,mm5 ; mm1=tmp7 + + pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3,mm0 + pfadd mm0,mm4 + pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3,mm0 ; mm3=tmp12 + pfsub mm4,mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3,mm1 ; mm3=tmp6 + movq mm5,mm6 + 
movq mm0,mm7 + pfadd mm6,mm1 ; mm6=data0=(00 01) + pfadd mm7,mm3 ; mm7=data1=(10 11) + pfsub mm5,mm1 ; mm5=data7=(70 71) + pfsub mm0,mm3 ; mm0=data6=(60 61) + pfsub mm2,mm3 ; mm2=tmp5 + + movq mm1,mm6 ; transpose coefficients + punpckldq mm6,mm7 ; mm6=(00 10) + punpckhdq mm1,mm7 ; mm1=(01 11) + movq mm3,mm0 ; transpose coefficients + punpckldq mm0,mm5 ; mm0=(60 70) + punpckhdq mm3,mm5 ; mm3=(61 71) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm5, MMWORD [wk(1)] ; mm5=tmp3 + + pfadd mm4,mm2 ; mm4=tmp4 + movq mm6,mm7 + movq mm1,mm5 + pfadd mm7,mm2 ; mm7=data2=(20 21) + pfadd mm5,mm4 ; mm5=data4=(40 41) + pfsub mm6,mm2 ; mm6=data5=(50 51) + pfsub mm1,mm4 ; mm1=data3=(30 31) + + movq mm0,mm7 ; transpose coefficients + punpckldq mm7,mm1 ; mm7=(20 30) + punpckhdq mm0,mm1 ; mm0=(21 31) + movq mm3,mm5 ; transpose coefficients + punpckldq mm5,mm6 ; mm5=(40 50) + punpckhdq mm3,mm6 ; mm3=(41 51) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 + +.nextcolumn: + add esi, byte 2*SIZEOF_JCOEF ; coef_block + add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/2 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4,mm0 + movq mm5,mm1 + pfsub mm0,mm2 ; mm0=tmp11 + pfsub mm1,mm3 + pfadd mm4,mm2 ; mm4=tmp10 + pfadd mm5,mm3 ; mm5=tmp13 + + pfmul mm1,[GOTOFF(ebx,PD_1_414)] + pfsub mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm5 ; mm4=tmp3 + pfsub mm0,mm1 ; mm0=tmp2 + pfadd mm6,mm5 ; mm6=tmp0 + pfadd mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4,mm2 + movq mm0,mm5 + pfadd mm2,mm1 ; mm2=z11 + pfadd mm5,mm3 ; mm5=z13 + pfsub mm4,mm1 ; mm4=z12 + pfsub mm0,mm3 ; mm0=z10 + + movq mm1,mm2 + pfsub mm2,mm5 + pfadd mm1,mm5 ; mm1=tmp7 + + pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3,mm0 + pfadd mm0,mm4 + pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3,mm0 ; mm3=tmp12 + pfsub mm4,mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3,mm1 ; mm3=tmp6 + movq mm5,mm6 + movq mm0,mm7 + pfadd mm6,mm1 ; mm6=data0=(00 10) + pfadd mm7,mm3 ; mm7=data1=(01 11) + pfsub mm5,mm1 ; mm5=data7=(07 17) + pfsub mm0,mm3 ; mm0=data6=(06 16) + pfsub mm2,mm3 ; mm2=tmp5 + + movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] + pcmpeqd mm3,mm3 + psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 
0xFFFF 0x0000} + + pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) + pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) + pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) + pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) + + pand mm6,mm3 ; mm6=(00 -- 10 --) + pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11) + pand mm0,mm3 ; mm0=(06 -- 16 --) + pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17) + por mm6,mm7 ; mm6=(00 01 10 11) + por mm0,mm5 ; mm0=(06 07 16 17) + + movq mm1, MMWORD [wk(0)] ; mm1=tmp2 + movq mm3, MMWORD [wk(1)] ; mm3=tmp3 + + pfadd mm4,mm2 ; mm4=tmp4 + movq mm7,mm1 + movq mm5,mm3 + pfadd mm1,mm2 ; mm1=data2=(02 12) + pfadd mm3,mm4 ; mm3=data4=(04 14) + pfsub mm7,mm2 ; mm7=data5=(05 15) + pfsub mm5,mm4 ; mm5=data3=(03 13) + + movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] + pcmpeqd mm4,mm4 + psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) + pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) + pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) + pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) + + pand mm3,mm4 ; mm3=(04 -- 14 --) + pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15) + pand mm1,mm4 ; mm1=(02 -- 12 --) + pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13) + por mm3,mm7 ; mm3=(04 05 14 15) + por mm1,mm5 ; mm1=(02 03 12 13) + + movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] + + packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15) + packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17) + paddb mm6,mm2 + paddb mm1,mm2 + + movq mm4,mm6 ; transpose coefficients(phase 2) + punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17) + + movq mm7,mm6 ; transpose coefficients(phase 3) + punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + 
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + + poppic ebx ; restore GOT address + + add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 2*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jidctflt-sse.asm b/Builder/jni-1.11/simd/jidctflt-sse.asm new file mode 100644 index 000000000..4d4af2fff --- /dev/null +++ b/Builder/jni-1.11/simd/jidctflt-sse.asm @@ -0,0 +1,571 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse) + +EXTN(jconst_idct_float_sse): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_0_125 times 4 dd 0.125 ; 1/8 +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_sse) + +EXTN(jsimd_idct_float_sse): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store 
into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) + cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) + movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD 
[XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) + punpcklwd mm1,mm1 ; mm1=(20 20 21 21) + + psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) + cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) + psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) + cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) + cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) + + punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) + punpcklwd mm2,mm2 ; mm2=(40 40 41 41) + punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) + punpcklwd mm3,mm3 ; mm3=(60 60 61 61) + + psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) + psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) + cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) + cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) + psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) + psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) + cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) + cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) + + movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) + movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23) + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43) + movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + 
mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) + punpcklwd mm4,mm4 ; mm4=(10 10 11 11) + punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) + punpcklwd mm0,mm0 ; mm0=(30 30 31 31) + + psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) + psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) + cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) + cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) + psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) + cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) + cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) + + punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) + punpcklwd mm5,mm5 ; mm5=(50 50 51 51) + punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) + punpcklwd mm1,mm1 ; mm1=(70 70 71 71) + + movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) + movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) + + psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) + psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) + cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) + cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) + psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) + cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **) + cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) + movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, 
XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 
20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps 
xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] + + mulps xmm6,xmm1 ; descale(1/8) + mulps xmm7,xmm1 ; descale(1/8) + mulps xmm5,xmm1 ; descale(1/8) + mulps xmm0,xmm1 ; descale(1/8) + + movhlps xmm3,xmm6 + movhlps xmm1,xmm7 + cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) + cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) + cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) + cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) + packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) + packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) + + movhlps xmm6,xmm5 + movhlps xmm7,xmm0 + cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) + cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) + cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) + cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) + packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) + packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) + + packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) + packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) + + movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 + movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm5,xmm3 + movaps xmm0,xmm1 + addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) + subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) + subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) + + mulps xmm3,xmm6 ; descale(1/8) + mulps xmm1,xmm6 ; descale(1/8) + mulps xmm5,xmm6 ; descale(1/8) + mulps xmm0,xmm6 ; descale(1/8) + + movhlps xmm7,xmm3 + movhlps xmm2,xmm1 + cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) + cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) + cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) + cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) + packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) + packssdw mm3,mm7 ; mm3=data4=(04 14 24 34) + + movhlps xmm4,xmm5 + 
movhlps xmm6,xmm0 + cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) + cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) + cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) + cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) + packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) + packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) + + movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) + packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0,mm6 + paddb mm1,mm6 + paddb mm2,mm6 + paddb mm4,mm6 + + movq mm7,mm0 ; transpose coefficients(phase 1) + punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) + movq mm3,mm2 ; transpose coefficients(phase 1) + punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) + punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) + + movq mm5,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) + movq mm6,mm3 ; transpose coefficients(phase 2) + punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) + punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) + + movq mm1,mm0 ; transpose coefficients(phase 3) + punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) + movq mm4,mm5 ; transpose coefficients(phase 3) + punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 
4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jidctflt-sse2-64.asm b/Builder/jni-1.11/simd/jidctflt-sse2-64.asm new file mode 100644 index 000000000..bdda05d97 --- /dev/null +++ b/Builder/jni-1.11/simd/jidctflt-sse2-64.asm @@ -0,0 +1,482 @@ +; +; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [workspace] + collect_args + push rbx + + ; ---- Pass 1: process columns from input, store into work array. 
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + lea rdi, [workspace] ; FAST_FLOAT *wsptr + mov rcx, DCTSIZE/4 ; ctr +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD 
[MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[rel PD_1_414] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad 
xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[rel PD_1_847] ; xmm0=z5 + mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], 
xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add rsi, byte 4*SIZEOF_JCOEF ; coef_block + add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec rcx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + 
(DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + lea rsi, [workspace] ; FAST_FLOAT *wsptr + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + mov rcx, DCTSIZE/4 ; ctr +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[rel PD_1_414] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[rel PD_1_847] ; xmm0=z5 + mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps 
xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 
35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 + + add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add rdi, byte 4*SIZEOF_JSAMPROW + dec rcx ; ctr + jnz near .rowloop + + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jidctflt-sse2.asm b/Builder/jni-1.11/simd/jidctflt-sse2.asm new file mode 100644 index 000000000..a15a9c111 --- /dev/null +++ b/Builder/jni-1.11/simd/jidctflt-sse2.asm @@ -0,0 +1,497 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near 
.nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 
32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps 
xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; 
ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 
+ mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + 
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jidctfst-altivec.c b/Builder/jni-1.11/simd/jidctfst-altivec.c new file mode 100644 index 000000000..ec30c3995 --- /dev/null +++ b/Builder/jni-1.11/simd/jidctfst-altivec.c @@ -0,0 +1,257 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER INVERSE DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2). 
+ */ + +#include "jsimd_altivec.h" + + +#define F_1_082 277 /* FIX(1.082392200) */ +#define F_1_414 362 /* FIX(1.414213562) */ +#define F_1_847 473 /* FIX(1.847759065) */ +#define F_2_613 669 /* FIX(2.613125930) */ +#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ + +#define CONST_BITS 8 +#define PASS1_BITS 2 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + + +#define DO_IDCT(in) \ +{ \ + /* Even part */ \ + \ + tmp10 = vec_add(in##0, in##4); \ + tmp11 = vec_sub(in##0, in##4); \ + tmp13 = vec_add(in##2, in##6); \ + \ + tmp12 = vec_sub(in##2, in##6); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \ + tmp12 = vec_sub(tmp12, tmp13); \ + \ + tmp0 = vec_add(tmp10, tmp13); \ + tmp3 = vec_sub(tmp10, tmp13); \ + tmp1 = vec_add(tmp11, tmp12); \ + tmp2 = vec_sub(tmp11, tmp12); \ + \ + /* Odd part */ \ + \ + z13 = vec_add(in##5, in##3); \ + z10 = vec_sub(in##5, in##3); \ + z10s = vec_sl(z10, pre_multiply_scale_bits); \ + z11 = vec_add(in##1, in##7); \ + z12s = vec_sub(in##1, in##7); \ + z12s = vec_sl(z12s, pre_multiply_scale_bits); \ + \ + tmp11 = vec_sub(z11, z13); \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \ + \ + tmp7 = vec_add(z11, z13); \ + \ + /* To avoid overflow... 
\ + * \ + * (Original) \ + * tmp12 = -2.613125930 * z10 + z5; \ + * \ + * (This implementation) \ + * tmp12 = (-1.613125930 - 1) * z10 + z5; \ + * = -1.613125930 * z10 - z10 + z5; \ + */ \ + \ + z5 = vec_add(z10s, z12s); \ + z5 = vec_madds(z5, pw_F1847, pw_zero); \ + \ + tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \ + tmp10 = vec_sub(tmp10, z5); \ + tmp12 = vec_madds(z10s, pw_MF1613, z5); \ + tmp12 = vec_sub(tmp12, z10); \ + \ + tmp6 = vec_sub(tmp12, tmp7); \ + tmp5 = vec_sub(tmp11, tmp6); \ + tmp4 = vec_add(tmp10, tmp5); \ + \ + out0 = vec_add(tmp0, tmp7); \ + out1 = vec_add(tmp1, tmp6); \ + out2 = vec_add(tmp2, tmp5); \ + out3 = vec_sub(tmp3, tmp4); \ + out4 = vec_add(tmp3, tmp4); \ + out5 = vec_sub(tmp2, tmp5); \ + out6 = vec_sub(tmp1, tmp6); \ + out7 = vec_sub(tmp0, tmp7); \ +} + + +void +jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z5, z10, z10s, z11, z12s, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, + pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, + pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, + pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, + pass1_bits3 = { __8X(PASS1_BITS + 3) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + 
col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row); + + out0 = vec_sra(out0, pass1_bits3); + out1 = vec_sra(out1, pass1_bits3); + out2 = vec_sra(out2, pass1_bits3); + out3 = vec_sra(out3, pass1_bits3); + out4 = vec_sra(out4, pass1_bits3); + out5 = vec_sra(out5, pass1_bits3); + out6 = vec_sra(out6, pass1_bits3); + out7 = vec_sra(out7, pass1_bits3); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + 
vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/Builder/jni-1.11/simd/jidctfst-mmx.asm b/Builder/jni-1.11/simd/jidctfst-mmx.asm new file mode 100644 index 000000000..6e95bfbca --- /dev/null +++ b/Builder/jni-1.11/simd/jidctfst-mmx.asm @@ -0,0 +1,499 @@ +; +; jidctfst.asm - fast integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_idct_ifast_mmx) + +EXTN(jconst_idct_ifast_mmx): + +PW_F1414 times 4 dw F_1_414 << CONST_SHIFT +PW_F1847 times 4 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 4 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + 
alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table (read as quantptr) +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_ifast_mmx) + +EXTN(jsimd_idct_ifast_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array.
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm2, 
MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm1 + psubw mm0,mm2 ; mm0=tmp11 + psubw mm1,mm3 + paddw mm4,mm2 ; mm4=tmp10 + paddw mm5,mm3 ; mm5=tmp13 + + psllw mm1,PRE_MULTIPLY_SCALE_BITS + pmulhw mm1,[GOTOFF(ebx,PW_F1414)] + psubw mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + psubw mm4,mm5 ; mm4=tmp3 + psubw mm0,mm1 ; mm0=tmp2 + paddw mm6,mm5 ; mm6=tmp0 + paddw mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4,mm2 + movq mm0,mm5 + psubw mm2,mm1 ; mm2=z12 + psubw mm5,mm3 ; mm5=z10 + paddw mm4,mm1 ; mm4=z11 + paddw mm0,mm3 ; mm0=z13 + + movq mm1,mm5 ; mm1=z10(unscaled) + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm5,PRE_MULTIPLY_SCALE_BITS + + movq mm3,mm4 + psubw mm4,mm0 + paddw mm3,mm0 ; mm3=tmp7 + + psllw mm4,PRE_MULTIPLY_SCALE_BITS + pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0,mm5 + paddw mm5,mm2 + pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw mm2,[GOTOFF(ebx,PW_F1082)] + psubw mm0,mm1 + psubw mm2,mm5 ; mm2=tmp10 + paddw mm0,mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0,mm3 ; mm0=tmp6 + movq mm1,mm6 + movq mm5,mm7 + paddw mm6,mm3 ; mm6=data0=(00 01 02 03) + paddw mm7,mm0 ; mm7=data1=(10 11 12 13) + psubw mm1,mm3 ; mm1=data7=(70 71 72 73) + psubw mm5,mm0 ; mm5=data6=(60 61 62 63) + psubw mm4,mm0 ; mm4=tmp5 + + movq mm3,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm3,mm7 ; mm3=(02 12 03 13) + movq mm0,mm5 ; transpose coefficients(phase 1) + punpcklwd mm5,mm1 ; mm5=(60 70 61 71) + punpckhwd mm0,mm1 ; mm0=(62 72 63 73) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm1, MMWORD [wk(1)] ; mm1=tmp3 + + movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) + movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) + + paddw mm2,mm4 ; mm2=tmp4 + movq mm5,mm7 + movq mm0,mm1 + paddw mm7,mm4 ; mm7=data2=(20 21 22 23) + paddw mm1,mm2 ; mm1=data4=(40 41 42 43) + psubw mm5,mm4 ; mm5=data5=(50 51 52 53) + psubw mm0,mm2 ; mm0=data3=(30 31 32 33) + + movq mm4,mm7 ; transpose coefficients(phase 1) + punpcklwd mm7,mm0 ; mm7=(20 30 21 31) + punpckhwd mm4,mm0 ; mm4=(22 32 23 33) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm5 ; mm1=(40 50 41 51) + punpckhwd mm2,mm5 ; mm2=(42 52 43 53) + + movq mm0,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm7 ; mm6=(00 10 20 30) + punpckhdq mm0,mm7 ; mm0=(01 11 21 31) + movq mm5,mm3 ; transpose coefficients(phase 2) + punpckldq mm3,mm4 ; mm3=(02 12 22 32) + punpckhdq mm5,mm4 ; mm5=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) + movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) + + movq MMWORD 
[MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm7 ; mm1=(40 50 60 70) + punpckhdq mm6,mm7 ; mm6=(41 51 61 71) + movq mm0,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm4 ; mm2=(42 52 62 72) + punpckhdq mm0,mm4 ; mm0=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm0 + movq mm5,mm1 + psubw mm0,mm2 ; mm0=tmp11 + psubw mm1,mm3 + paddw mm4,mm2 ; mm4=tmp10 + paddw mm5,mm3 ; mm5=tmp13 + + psllw mm1,PRE_MULTIPLY_SCALE_BITS + pmulhw mm1,[GOTOFF(ebx,PW_F1414)] + psubw mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + psubw mm4,mm5 ; mm4=tmp3 + psubw mm0,mm1 ; mm0=tmp2 + paddw mm6,mm5 ; mm6=tmp0 + paddw mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + 
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm2 + movq mm0,mm5 + psubw mm2,mm1 ; mm2=z12 + psubw mm5,mm3 ; mm5=z10 + paddw mm4,mm1 ; mm4=z11 + paddw mm0,mm3 ; mm0=z13 + + movq mm1,mm5 ; mm1=z10(unscaled) + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm5,PRE_MULTIPLY_SCALE_BITS + + movq mm3,mm4 + psubw mm4,mm0 + paddw mm3,mm0 ; mm3=tmp7 + + psllw mm4,PRE_MULTIPLY_SCALE_BITS + pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0,mm5 + paddw mm5,mm2 + pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw mm2,[GOTOFF(ebx,PW_F1082)] + psubw mm0,mm1 + psubw mm2,mm5 ; mm2=tmp10 + paddw mm0,mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0,mm3 ; mm0=tmp6 + movq mm1,mm6 + movq mm5,mm7 + paddw mm6,mm3 ; mm6=data0=(00 10 20 30) + paddw mm7,mm0 ; mm7=data1=(01 11 21 31) + psraw mm6,(PASS1_BITS+3) ; descale + psraw mm7,(PASS1_BITS+3) ; descale + psubw mm1,mm3 ; mm1=data7=(07 17 27 37) + psubw mm5,mm0 ; mm5=data6=(06 16 26 36) + psraw mm1,(PASS1_BITS+3) ; descale + psraw mm5,(PASS1_BITS+3) ; descale + psubw mm4,mm0 ; mm4=tmp5 + + packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36) + packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37) + + movq mm3, MMWORD [wk(0)] ; mm3=tmp2 + movq mm0, MMWORD [wk(1)] ; mm0=tmp3 + + paddw mm2,mm4 ; mm2=tmp4 + movq mm5,mm3 + movq mm1,mm0 + paddw mm3,mm4 ; mm3=data2=(02 12 22 32) + paddw mm0,mm2 ; mm0=data4=(04 14 24 34) + psraw mm3,(PASS1_BITS+3) ; descale + psraw mm0,(PASS1_BITS+3) ; descale + psubw mm5,mm4 ; mm5=data5=(05 15 25 35) + psubw mm1,mm2 ; mm1=data3=(03 13 23 33) + psraw mm5,(PASS1_BITS+3) ; descale + psraw mm1,(PASS1_BITS+3) ; descale + + movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] + + packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34) + packsswb mm1,mm5 ; mm1=(03 13 23 33 
05 15 25 35) + + paddb mm6,mm4 + paddb mm7,mm4 + paddb mm3,mm4 + paddb mm1,mm4 + + movq mm2,mm6 ; transpose coefficients(phase 1) + punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31) + punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37) + movq mm0,mm3 ; transpose coefficients(phase 1) + punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33) + punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35) + + movq mm5,mm6 ; transpose coefficients(phase 2) + punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33) + movq mm4,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17) + punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37) + + movq mm7,mm6 ; transpose coefficients(phase 3) + punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17) + movq mm1,mm5 ; transpose coefficients(phase 3) + punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jidctfst-sse2-64.asm b/Builder/jni-1.11/simd/jidctfst-sse2-64.asm new file mode 100644 index 000000000..48846426d --- /dev/null +++ b/Builder/jni-1.11/simd/jidctfst-sse2-64.asm @@ -0,0 +1,491 @@ +; +; jidctfst.asm - fast integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process columns from input.
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD 
[XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + psubw xmm0,xmm2 ; xmm0=tmp11 + psubw xmm1,xmm3 + paddw xmm4,xmm2 ; xmm4=tmp10 + paddw xmm5,xmm3 ; xmm5=tmp13 + + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1,[rel PW_F1414] + psubw xmm1,xmm5 ; xmm1=tmp12 + + movdqa xmm6,xmm4 + movdqa xmm7,xmm0 + psubw xmm4,xmm5 ; xmm4=tmp3 + psubw xmm0,xmm1 ; xmm0=tmp2 + paddw xmm6,xmm5 ; xmm6=tmp0 + paddw xmm7,xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm2 + movdqa xmm0,xmm5 + psubw xmm2,xmm1 ; xmm2=z12 + psubw xmm5,xmm3 ; xmm5=z10 + paddw xmm4,xmm1 ; xmm4=z11 + paddw xmm0,xmm3 ; xmm0=z13 + + movdqa xmm1,xmm5 ; xmm1=z10(unscaled) + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3,xmm4 + psubw xmm4,xmm0 + paddw xmm3,xmm0 ; xmm3=tmp7 + + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0,xmm5 + paddw xmm5,xmm2 + pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 + pmulhw xmm0,[rel PW_MF1613] + pmulhw xmm2,[rel PW_F1082] + psubw xmm0,xmm1 + psubw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm0,xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0,xmm3 ; xmm0=tmp6 + movdqa xmm1,xmm6 + movdqa xmm5,xmm7 + paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4,xmm0 ; xmm4=tmp5 + + movdqa xmm3,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2,xmm4 ; xmm2=tmp4 + movdqa xmm5,xmm7 + movdqa xmm0,xmm1 + paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0,xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3,xmm4 ; 
xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7,xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7,xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + 
DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2,xmm6 + movdqa xmm0,xmm5 + psubw xmm6,xmm1 ; xmm6=tmp11 + psubw xmm5,xmm3 + paddw xmm2,xmm1 ; xmm2=tmp10 + paddw xmm0,xmm3 ; xmm0=tmp13 + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F1414] + psubw xmm5,xmm0 ; xmm5=tmp12 + + movdqa xmm1,xmm2 + movdqa xmm3,xmm6 + psubw xmm2,xmm0 ; xmm2=tmp3 + psubw xmm6,xmm5 ; xmm6=tmp2 + paddw xmm1,xmm0 ; xmm1=tmp0 + paddw xmm3,xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + psubw xmm0,xmm7 ; xmm0=z12 + psubw xmm4,xmm5 ; xmm4=z10 + paddw xmm2,xmm7 ; xmm2=z11 + paddw xmm6,xmm5 ; xmm6=z13 + + movdqa xmm7,xmm4 ; xmm7=z10(unscaled) + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5,xmm2 + psubw xmm2,xmm6 + paddw xmm5,xmm6 ; xmm5=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6,xmm4 + paddw xmm4,xmm0 + pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 + pmulhw xmm6,[rel PW_MF1613] + pmulhw xmm0,[rel PW_F1082] + psubw xmm6,xmm7 + psubw xmm0,xmm4 ; xmm0=tmp10 + paddw xmm6,xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6,xmm5 ; xmm6=tmp6 + movdqa xmm7,xmm1 + movdqa xmm4,xmm3 + paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1,(PASS1_BITS+3) ; descale + psraw xmm3,(PASS1_BITS+3) ; descale + psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7,(PASS1_BITS+3) ; descale + psraw xmm4,(PASS1_BITS+3) ; descale + psubw xmm2,xmm6 ; xmm2=tmp5 + + packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0,xmm2 ; xmm0=tmp4 + movdqa xmm4,xmm5 + movdqa xmm7,xmm6 + paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5,(PASS1_BITS+3) ; descale + psraw xmm6,(PASS1_BITS+3) ; descale + psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4,(PASS1_BITS+3) ; descale + psraw xmm7,(PASS1_BITS+3) ; descale + + movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1,xmm2 + paddb xmm3,xmm2 + paddb xmm5,xmm2 + paddb xmm7,xmm2 + + movdqa xmm0,xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 
50 51 60 61 70 71) + punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + 
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16
diff --git a/Builder/jni-1.11/simd/jidctfst-sse2.asm b/Builder/jni-1.11/simd/jidctfst-sse2.asm
new file mode 100644
index 000000000..f591e55f0
--- /dev/null
+++ b/Builder/jni-1.11/simd/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst-sse2.asm - fast integer IDCT (SSE2), 32-bit (BITS 32) version
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2 ; must equal IFAST_SCALE_BITS (checked just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; psllw count applied to an operand before pmulhw
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; = 6 when CONST_BITS == 8
+
+        alignz  16
+        global  EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT ; 8 words: 1.414213562, pre-shifted for pmulhw
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT ; 8 words: 1.847759065, pre-shifted for pmulhw
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT ; 8 words: -(2.613125930 - 1), pre-shifted for pmulhw
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT ; 8 words: 1.082392200, pre-shifted for pmulhw
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; 16 bytes of CENTERJSAMPLE
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; 32-bit entry point: pass 1 processes columns, pass 2 processes rows.
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+; The four arguments are read from the stack through the (b)+N macros below.
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+        push    ebp
+        mov     eax,esp  ; eax = esp after the push: [eax]=saved ebp, args at eax+8..eax+20
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+        mov     [esp],eax  ; kept at [original_ebp]; reloaded before pass 2
+        mov     ebp,esp  ; ebp = aligned ebp
+        lea     esp, [wk(0)]  ; reserve the WK_NUM xmmword scratch slots below ebp
+        pushpic ebx
+;       push    ecx  ; unused
+;       push    edx  ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx  ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]  ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT  ; something nonzero already: do the full column pass
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     xmm1,xmm0  ; xmm1 = OR of blocks 1..7
+        packsswb xmm1,xmm1
+        packsswb xmm1,xmm1
+        movd    eax,xmm1
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]  ; same table element type as every other access here
+
+        movdqa  xmm7,xmm0  ; xmm0=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm0,xmm0  ; xmm0=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm7,xmm7  ; xmm7=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm6,xmm0,0x00  ; xmm6=col0=(00 00 00 00 00 00 00 00)
+        pshufd  xmm2,xmm0,0x55  ; xmm2=col1=(01 01 01 01 01 01 01 01)
+        pshufd  xmm5,xmm0,0xAA  ; xmm5=col2=(02 02 02 02 02 02 02 02)
+        pshufd  xmm0,xmm0,0xFF  ; xmm0=col3=(03 03 03 03 03 03 03 03)
+        pshufd  xmm1,xmm7,0x00  ; xmm1=col4=(04 04 04 04 04 04 04 04)
+        pshufd  xmm4,xmm7,0x55  ; xmm4=col5=(05 05 05 05 05 05 05 05)
+        pshufd  xmm3,xmm7,0xAA  ; xmm3=col6=(06 06 06 06 06 06 06 06)
+        pshufd  xmm7,xmm7,0xFF  ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+        movdqa  XMMWORD [wk(0)], xmm2  ; wk(0)=col1
+        movdqa  XMMWORD [wk(1)], xmm0  ; wk(1)=col3
+        jmp     near .column_end
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movdqa  xmm4,xmm0
+        movdqa  xmm5,xmm1
+        psubw   xmm0,xmm2  ; xmm0=tmp11
+        psubw   xmm1,xmm3
+        paddw   xmm4,xmm2  ; xmm4=tmp10
+        paddw   xmm5,xmm3  ; xmm5=tmp13
+
+        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]
+        psubw   xmm1,xmm5  ; xmm1=tmp12
+
+        movdqa  xmm6,xmm4
+        movdqa  xmm7,xmm0
+        psubw   xmm4,xmm5  ; xmm4=tmp3
+        psubw   xmm0,xmm1  ; xmm0=tmp2
+        paddw   xmm6,xmm5  ; xmm6=tmp0
+        paddw   xmm7,xmm1  ; xmm7=tmp1
+
+        movdqa  XMMWORD [wk(1)], xmm4  ; wk(1)=tmp3
+        movdqa  XMMWORD [wk(0)], xmm0  ; wk(0)=tmp2
+
+        ; -- Odd part
+
+        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movdqa  xmm4,xmm2
+        movdqa  xmm0,xmm5
+        psubw   xmm2,xmm1  ; xmm2=z12
+        psubw   xmm5,xmm3  ; xmm5=z10
+        paddw   xmm4,xmm1  ; xmm4=z11
+        paddw   xmm0,xmm3  ; xmm0=z13
+
+        movdqa  xmm1,xmm5  ; xmm1=z10(unscaled)
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+
+        movdqa  xmm3,xmm4
+        psubw   xmm4,xmm0
+        paddw   xmm3,xmm0  ; xmm3=tmp7
+
+        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]  ; xmm4=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movdqa  xmm0,xmm5
+        paddw   xmm5,xmm2
+        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]  ; xmm5=z5
+        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]
+        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]
+        psubw   xmm0,xmm1
+        psubw   xmm2,xmm5  ; xmm2=tmp10
+        paddw   xmm0,xmm5  ; xmm0=tmp12
+
+        ; -- Final output stage
+
+        psubw   xmm0,xmm3  ; xmm0=tmp6
+        movdqa  xmm1,xmm6
+        movdqa  xmm5,xmm7
+        paddw   xmm6,xmm3  ; xmm6=data0=(00 01 02 03 04 05 06 07)
+        paddw   xmm7,xmm0  ; xmm7=data1=(10 11 12 13 14 15 16 17)
+        psubw   xmm1,xmm3  ; xmm1=data7=(70 71 72 73 74 75 76 77)
+        psubw   xmm5,xmm0  ; xmm5=data6=(60 61 62 63 64 65 66 67)
+        psubw   xmm4,xmm0  ; xmm4=tmp5
+
+        movdqa  xmm3,xmm6  ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7  ; xmm6=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm3,xmm7  ; xmm3=(04 14 05 15 06 16 07 17)
+        movdqa  xmm0,xmm5  ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm1  ; xmm5=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm0,xmm1  ; xmm0=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm7, XMMWORD [wk(0)]  ; xmm7=tmp2
+        movdqa  xmm1, XMMWORD [wk(1)]  ; xmm1=tmp3
+
+        movdqa  XMMWORD [wk(0)], xmm5  ; wk(0)=(60 70 61 71 62 72 63 73)
+        movdqa  XMMWORD [wk(1)], xmm0  ; wk(1)=(64 74 65 75 66 76 67 77)
+
+        paddw   xmm2,xmm4  ; xmm2=tmp4
+        movdqa  xmm5,xmm7
+        movdqa  xmm0,xmm1
+        paddw   xmm7,xmm4  ; xmm7=data2=(20 21 22 23 24 25 26 27)
+        paddw   xmm1,xmm2  ; xmm1=data4=(40 41 42 43 44 45 46 47)
+        psubw   xmm5,xmm4  ; xmm5=data5=(50 51 52 53 54 55 56 57)
+        psubw   xmm0,xmm2  ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+        movdqa  xmm4,xmm7  ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm0  ; xmm7=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm4,xmm0  ; xmm4=(24 34 25 35 26 36 27 37)
+        movdqa  xmm2,xmm1  ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm5  ; xmm1=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm5  ; xmm2=(44 54 45 55 46 56 47 57)
+
+        movdqa  xmm0,xmm3  ; transpose coefficients(phase 2)
+        punpckldq xmm3,xmm4  ; xmm3=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm0,xmm4  ; xmm0=(06 16 26 36 07 17 27 37)
+        movdqa  xmm5,xmm6  ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm7  ; xmm6=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm5,xmm7  ; xmm5=(02 12 22 32 03 13 23 33)
+
+        movdqa  xmm4, XMMWORD [wk(0)]  ; xmm4=(60 70 61 71 62 72 63 73)
+        movdqa  xmm7, XMMWORD [wk(1)]  ; xmm7=(64 74 65 75 66 76 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm3  ; wk(0)=(04 14 24 34 05 15 25 35)
+        movdqa  XMMWORD [wk(1)], xmm0  ; wk(1)=(06 16 26 36 07 17 27 37)
+
+        movdqa  xmm3,xmm1  ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm4  ; xmm1=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm3,xmm4  ; xmm3=(42 52 62 72 43 53 63 73)
+        movdqa  xmm0,xmm2  ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm7  ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm0,xmm7  ; xmm0=(46 56 66 76 47 57 67 77)
+
+        movdqa  xmm4,xmm6  ; transpose coefficients(phase 3)
+        punpcklqdq xmm6,xmm1  ; xmm6=col0=(00 10 20 30 40 50 60 70)
+        punpckhqdq xmm4,xmm1  ; xmm4=col1=(01 11 21 31 41 51 61 71)
+        movdqa  xmm7,xmm5  ; transpose coefficients(phase 3)
+        punpcklqdq xmm5,xmm3  ; xmm5=col2=(02 12 22 32 42 52 62 72)
+        punpckhqdq xmm7,xmm3  ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+        movdqa  xmm1, XMMWORD [wk(0)]  ; xmm1=(04 14 24 34 05 15 25 35)
+        movdqa  xmm3, XMMWORD [wk(1)]  ; xmm3=(06 16 26 36 07 17 27 37)
+
+        movdqa  XMMWORD [wk(0)], xmm4  ; wk(0)=col1
+        movdqa  XMMWORD [wk(1)], xmm7  ; wk(1)=col3
+
+        movdqa  xmm4,xmm1  ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm2  ; xmm1=col4=(04 14 24 34 44 54 64 74)
+        punpckhqdq xmm4,xmm2  ; xmm4=col5=(05 15 25 35 45 55 65 75)
+        movdqa  xmm7,xmm3  ; transpose coefficients(phase 3)
+        punpcklqdq xmm3,xmm0  ; xmm3=col6=(06 16 26 36 46 56 66 76)
+        punpckhqdq xmm7,xmm0  ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]  ; eax may have been overwritten by the zero test; reload the frame base
+        mov     edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]
+
+        ; -- Even part
+
+        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+        movdqa  xmm2,xmm6
+        movdqa  xmm0,xmm5
+        psubw   xmm6,xmm1  ; xmm6=tmp11
+        psubw   xmm5,xmm3
+        paddw   xmm2,xmm1  ; xmm2=tmp10
+        paddw   xmm0,xmm3  ; xmm0=tmp13
+
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]
+        psubw   xmm5,xmm0  ; xmm5=tmp12
+
+        movdqa  xmm1,xmm2
+        movdqa  xmm3,xmm6
+        psubw   xmm2,xmm0  ; xmm2=tmp3
+        psubw   xmm6,xmm5  ; xmm6=tmp2
+        paddw   xmm1,xmm0  ; xmm1=tmp0
+        paddw   xmm3,xmm5  ; xmm3=tmp1
+
+        movdqa  xmm0, XMMWORD [wk(0)]  ; xmm0=col1
+        movdqa  xmm5, XMMWORD [wk(1)]  ; xmm5=col3
+
+        movdqa  XMMWORD [wk(0)], xmm2  ; wk(0)=tmp3
+        movdqa  XMMWORD [wk(1)], xmm6  ; wk(1)=tmp2
+
+        ; -- Odd part
+
+        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+        movdqa  xmm2,xmm0
+        movdqa  xmm6,xmm4
+        psubw   xmm0,xmm7  ; xmm0=z12
+        psubw   xmm4,xmm5  ; xmm4=z10
+        paddw   xmm2,xmm7  ; xmm2=z11
+        paddw   xmm6,xmm5  ; xmm6=z13
+
+        movdqa  xmm7,xmm4  ; xmm7=z10(unscaled)
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
+        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
+
+        movdqa  xmm5,xmm2
+        psubw   xmm2,xmm6
+        paddw   xmm5,xmm6  ; xmm5=tmp7
+
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]  ; xmm2=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movdqa  xmm6,xmm4
+        paddw   xmm4,xmm0
+        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]  ; xmm4=z5
+        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]
+        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]
+        psubw   xmm6,xmm7
+        psubw   xmm0,xmm4  ; xmm0=tmp10
+        paddw   xmm6,xmm4  ; xmm6=tmp12
+
+        ; -- Final output stage
+
+        psubw   xmm6,xmm5  ; xmm6=tmp6
+        movdqa  xmm7,xmm1
+        movdqa  xmm4,xmm3
+        paddw   xmm1,xmm5  ; xmm1=data0=(00 10 20 30 40 50 60 70)
+        paddw   xmm3,xmm6  ; xmm3=data1=(01 11 21 31 41 51 61 71)
+        psraw   xmm1,(PASS1_BITS+3)  ; descale
+        psraw   xmm3,(PASS1_BITS+3)  ; descale
+        psubw   xmm7,xmm5  ; xmm7=data7=(07 17 27 37 47 57 67 77)
+        psubw   xmm4,xmm6  ; xmm4=data6=(06 16 26 36 46 56 66 76)
+        psraw   xmm7,(PASS1_BITS+3)  ; descale
+        psraw   xmm4,(PASS1_BITS+3)  ; descale
+        psubw   xmm2,xmm6  ; xmm2=tmp5
+
+        packsswb xmm1,xmm4  ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        packsswb xmm3,xmm7  ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm5, XMMWORD [wk(1)]  ; xmm5=tmp2
+        movdqa  xmm6, XMMWORD [wk(0)]  ; xmm6=tmp3
+
+        paddw   xmm0,xmm2  ; xmm0=tmp4
+        movdqa  xmm4,xmm5
+        movdqa  xmm7,xmm6
+        paddw   xmm5,xmm2  ; xmm5=data2=(02 12 22 32 42 52 62 72)
+        paddw   xmm6,xmm0  ; xmm6=data4=(04 14 24 34 44 54 64 74)
+        psraw   xmm5,(PASS1_BITS+3)  ; descale
+        psraw   xmm6,(PASS1_BITS+3)  ; descale
+        psubw   xmm4,xmm2  ; xmm4=data5=(05 15 25 35 45 55 65 75)
+        psubw   xmm7,xmm0  ; xmm7=data3=(03 13 23 33 43 53 63 73)
+        psraw   xmm4,(PASS1_BITS+3)  ; descale
+        psraw   xmm7,(PASS1_BITS+3)  ; descale
+
+        movdqa  xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+        packsswb xmm5,xmm6  ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+        packsswb xmm7,xmm4  ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+        paddb   xmm1,xmm2  ; add CENTERJSAMPLE to every packed byte
+        paddb   xmm3,xmm2
+        paddb   xmm5,xmm2
+        paddb   xmm7,xmm2
+
+        movdqa  xmm0,xmm1  ; transpose coefficients(phase 1)
+        punpcklbw xmm1,xmm3  ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+        punpckhbw xmm0,xmm3  ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+        movdqa  xmm6,xmm5  ; transpose coefficients(phase 1)
+        punpcklbw xmm5,xmm7  ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+        punpckhbw xmm6,xmm7  ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+        movdqa  xmm4,xmm1  ; transpose coefficients(phase 2)
+        punpcklwd xmm1,xmm5  ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm5  ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+        movdqa  xmm2,xmm6  ; transpose coefficients(phase 2)
+        punpcklwd xmm6,xmm0  ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+        punpckhwd xmm2,xmm0  ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+        movdqa  xmm3,xmm1  ; transpose coefficients(phase 3)
+        punpckldq xmm1,xmm6  ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm3,xmm6  ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+        movdqa  xmm7,xmm4  ; transpose coefficients(phase 3)
+        punpckldq xmm4,xmm2  ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+        punpckhdq xmm7,xmm2  ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1  ; output row 0
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3  ; output row 2
+        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4  ; output row 4
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7  ; output row 6
+
+        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5  ; output row 1
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0  ; output row 3
+        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6  ; output row 5
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2  ; output row 7
+
+        pop     edi
+        pop     esi
+;       pop     edx  ; need not be preserved
+;       pop     ecx  ; unused
+        poppic  ebx
+        mov     esp,ebp  ; esp <- aligned ebp
+        pop     esp  ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/Builder/jni-1.11/simd/jidctint-altivec.c b/Builder/jni-1.11/simd/jidctint-altivec.c new file mode 100644 index 000000000..935f35d1e --- /dev/null +++ b/Builder/jni-1.11/simd/jidctint-altivec.c @@ -0,0 +1,359 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+/* DO_IDCT(in, PASS): one 1-D IDCT pass over in##0..in##7, results in out0..out7.
+ * PASS (1 or 2) selects pd_descale_p##PASS and descale_p##PASS; uses caller's locals. */
+#define DO_IDCT(in, PASS) \
+{ \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  /* Interleave inputs 2 and 6 so each vec_msums forms a two-term sum. */ \
+  in##26l = vec_mergeh(in##2, in##6); \
+  in##26h = vec_mergel(in##2, in##6); \
+  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+  \
+  tmp0 = vec_add(in##0, in##4); \
+  tmp1 = vec_sub(in##0, in##4); \
+  /* Unpack tmp0 into two int vectors, shift left by const_bits, add pd_descale_p##PASS. */ \
+  tmp0l = vec_unpackh(tmp0); \
+  tmp0h = vec_unpackl(tmp0); \
+  tmp0l = vec_sl(tmp0l, const_bits); \
+  tmp0h = vec_sl(tmp0h, const_bits); \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l); \
+  tmp10h = vec_add(tmp0h, tmp3h); \
+  tmp13l = vec_sub(tmp0l, tmp3l); \
+  tmp13h = vec_sub(tmp0h, tmp3h); \
+  /* Same unpack / shift / add sequence for tmp1. */ \
+  tmp1l = vec_unpackh(tmp1); \
+  tmp1h = vec_unpackl(tmp1); \
+  tmp1l = vec_sl(tmp1l, const_bits); \
+  tmp1h = vec_sl(tmp1h, const_bits); \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l); \
+  tmp11h = vec_add(tmp1h, tmp2h); \
+  tmp12l = vec_sub(tmp1l, tmp2l); \
+  tmp12h = vec_sub(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(in##3, in##7); \
+  z4 = vec_add(in##1, in##5); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+   * z3 += z5; z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3; tmp1 += z4; \
+   * tmp2 += z3; tmp3 += z4; \
+   */ \
+  \
+  in##71l = vec_mergeh(in##7, in##1); \
+  in##71h = vec_mergel(in##7, in##1); \
+  /* z3l/z3h and z4l/z4h are passed as the vec_msums accumulator argument. */ \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+  \
+  in##53l = vec_mergeh(in##5, in##3); \
+  in##53h = vec_mergel(in##5, in##3); \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = vec_add(tmp10l, tmp3l); \
+  out0h = vec_add(tmp10h, tmp3h); \
+  out7l = vec_sub(tmp10l, tmp3l); \
+  out7h = vec_sub(tmp10h, tmp3h); \
+  /* Shift right by descale_p##PASS, then pack each l/h pair into one vector. */ \
+  out0l = vec_sra(out0l, descale_p##PASS); \
+  out0h = vec_sra(out0h, descale_p##PASS); \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  \
+  out0 = vec_pack(out0l, out0h); \
+  out7 = vec_pack(out7l, out7h); \
+  \
+  out1l = vec_add(tmp11l, tmp2l); \
+  out1h = vec_add(tmp11h, tmp2h); \
+  out6l = vec_sub(tmp11l, tmp2l); \
+  out6h = vec_sub(tmp11h, tmp2h); \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out1 = vec_pack(out1l, out1h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  out2l = vec_add(tmp12l, tmp1l); \
+  out2h = vec_add(tmp12h, tmp1h); \
+  out5l = vec_sub(tmp12l, tmp1l); \
+  out5h = vec_sub(tmp12h, tmp1h); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out5 = vec_pack(out5l, out5h); \
+  \
+  out3l = vec_add(tmp13l, tmp0l); \
+  out3h = vec_add(tmp13h, tmp0h); \
+  out4l = vec_sub(tmp13l, tmp0l); \
+  out4h = vec_sub(tmp13h, tmp0h); \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  out4l = vec_sra(out4l, descale_p##PASS); \
+  out4h = vec_sra(out4h, descale_p##PASS); \
+  \
+  out3 = vec_pack(out3l, out3h); \
+  out4 = vec_pack(out4l, out4h); \
+}
+
+
+void +jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block, + 
JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, z3, z4, + z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, + row71l, row71h, row26l, row26h, row53l, row53h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, + tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, + z3l, z3h, z4l, z4h, + out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, + out5l, out5h, out6l, out6h, out7l, out7h; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_zero = { __4X(0) }, + pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }, + const_bits = { __4X(CONST_BITS) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, 
coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + col0 = vec_sl(col0, pass1_bits); + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col, 1); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row, 2); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int 
*)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/Builder/jni-1.11/simd/jidctint-mmx.asm b/Builder/jni-1.11/simd/jidctint-mmx.asm new file mode 100644 index 000000000..5bd198120 --- /dev/null +++ b/Builder/jni-1.11/simd/jidctint-mmx.asm @@ -0,0 +1,851 @@ +; +; jidctint.asm - accurate integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_mmx) + +EXTN(jconst_idct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 2 dw 
(F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 12 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_islow_mmx) + +EXTN(jsimd_idct_islow_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0,PASS1_BITS + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD 
[MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4,mm1 ; mm1=in2=z2 + movq mm5,mm1 + punpcklwd mm4,mm3 ; mm3=in6=z3 + punpckhwd mm5,mm3 + movq mm1,mm4 + movq mm3,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6,mm0 + paddw mm0,mm2 ; mm0=in0+in4 + psubw mm6,mm2 ; mm6=in0-in4 + + pxor mm7,mm7 + pxor mm2,mm2 + punpcklwd mm7,mm0 ; mm7=tmp0L + punpckhwd mm2,mm0 ; mm2=tmp0H + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0,mm7 + paddd mm7,mm4 ; mm7=tmp10L + psubd mm0,mm4 ; mm0=tmp13L + movq mm4,mm2 + paddd mm2,mm5 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5,mm5 + pxor mm7,mm7 + punpcklwd mm5,mm6 ; mm5=tmp1L + punpckhwd mm7,mm6 ; mm7=tmp1H + psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2,mm5 + paddd mm5,mm1 ; mm5=tmp11L + psubd mm2,mm1 ; mm2=tmp12L + movq mm0,mm7 + paddd mm7,mm3 ; mm7=tmp11H + psubd mm0,mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], 
mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm5,mm6 + movq mm7,mm4 + paddw mm5,mm3 ; mm5=z3 + paddw mm7,mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2,mm5 + movq mm0,mm5 + punpcklwd mm2,mm7 + punpckhwd mm0,mm7 + movq mm5,mm2 + movq mm7,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2,mm3 + movq mm0,mm3 + punpcklwd 
mm2,mm4 + punpckhwd mm0,mm4 + movq mm3,mm2 + movq mm4,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3,mm5 ; mm3=tmp3L + paddd mm4,mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2,mm1 + movq mm0,mm1 + punpcklwd mm2,mm6 + punpckhwd mm0,mm6 + movq mm1,mm2 + movq mm6,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2,mm5 ; mm2=tmp1L + paddd mm0,mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2,mm5 + movq mm0,mm7 + paddd mm5,mm3 ; mm5=data0L + paddd mm7,mm4 ; mm7=data0H + psubd mm2,mm3 ; mm2=data7L + psubd mm0,mm4 ; mm0=data7H + + movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] + + paddd mm5,mm3 + paddd mm7,mm3 + psrad mm5,DESCALE_P1 + psrad mm7,DESCALE_P1 + paddd mm2,mm3 + paddd mm0,mm3 + psrad mm2,DESCALE_P1 + psrad mm0,DESCALE_P1 + + packssdw mm5,mm7 ; mm5=data0=(00 01 02 03) + packssdw mm2,mm0 ; mm2=data7=(70 71 72 73) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7,mm4 + movq mm0,mm3 + paddd mm4,mm1 ; mm4=data1L + paddd mm3,mm6 ; mm3=data1H + psubd mm7,mm1 ; mm7=data6L + psubd mm0,mm6 ; mm0=data6H + + movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1] + + paddd mm4,mm1 + paddd mm3,mm1 + psrad mm4,DESCALE_P1 + psrad mm3,DESCALE_P1 + paddd 
mm7,mm1 + paddd mm0,mm1 + psrad mm7,DESCALE_P1 + psrad mm0,DESCALE_P1 + + packssdw mm4,mm3 ; mm4=data1=(10 11 12 13) + packssdw mm7,mm0 ; mm7=data6=(60 61 62 63) + + movq mm6,mm5 ; transpose coefficients(phase 1) + punpcklwd mm5,mm4 ; mm5=(00 10 01 11) + punpckhwd mm6,mm4 ; mm6=(02 12 03 13) + movq mm1,mm7 ; transpose coefficients(phase 1) + punpcklwd mm7,mm2 ; mm7=(60 70 61 71) + punpckhwd mm1,mm2 ; mm1=(62 72 63 73) + + movq mm3, MMWORD [wk(6)] ; mm3=tmp12L + movq mm0, MMWORD [wk(7)] ; mm0=tmp12H + movq mm4, MMWORD [wk(10)] ; mm4=tmp1L + movq mm2, MMWORD [wk(11)] ; mm2=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) + movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) + movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) + movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) + + movq mm5,mm3 + movq mm6,mm0 + paddd mm3,mm4 ; mm3=data2L + paddd mm0,mm2 ; mm0=data2H + psubd mm5,mm4 ; mm5=data5L + psubd mm6,mm2 ; mm6=data5H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] + + paddd mm3,mm7 + paddd mm0,mm7 + psrad mm3,DESCALE_P1 + psrad mm0,DESCALE_P1 + paddd mm5,mm7 + paddd mm6,mm7 + psrad mm5,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm3,mm0 ; mm3=data2=(20 21 22 23) + packssdw mm5,mm6 ; mm5=data5=(50 51 52 53) + + movq mm1, MMWORD [wk(2)] ; mm1=tmp13L + movq mm4, MMWORD [wk(3)] ; mm4=tmp13H + movq mm2, MMWORD [wk(8)] ; mm2=tmp0L + movq mm7, MMWORD [wk(9)] ; mm7=tmp0H + + movq mm0,mm1 + movq mm6,mm4 + paddd mm1,mm2 ; mm1=data3L + paddd mm4,mm7 ; mm4=data3H + psubd mm0,mm2 ; mm0=data4L + psubd mm6,mm7 ; mm6=data4H + + movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] + + paddd mm1,mm2 + paddd mm4,mm2 + psrad mm1,DESCALE_P1 + psrad mm4,DESCALE_P1 + paddd mm0,mm2 + paddd mm6,mm2 + psrad mm0,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm1,mm4 ; mm1=data3=(30 31 32 33) + packssdw mm0,mm6 ; mm0=data4=(40 41 42 43) + + movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) + movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13) + + movq mm4,mm3 ; transpose 
coefficients(phase 1) + punpcklwd mm3,mm1 ; mm3=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm6,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm5 ; mm0=(40 50 41 51) + punpckhwd mm6,mm5 ; mm6=(42 52 43 53) + + movq mm1,mm7 ; transpose coefficients(phase 2) + punpckldq mm7,mm3 ; mm7=(00 10 20 30) + punpckhdq mm1,mm3 ; mm1=(01 11 21 31) + movq mm5,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm4 ; mm2=(02 12 22 32) + punpckhdq mm5,mm4 ; mm5=(03 13 23 33) + + movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) + movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm7,mm0 ; transpose coefficients(phase 2) + punpckldq mm0,mm3 ; mm0=(40 50 60 70) + punpckhdq mm7,mm3 ; mm7=(41 51 61 71) + movq mm1,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm4 ; mm6=(42 52 62 72) + punpckhdq mm1,mm4 ; mm1=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4,mm1 ; mm1=in2=z2 + movq mm5,mm1 + punpcklwd mm4,mm3 ; mm3=in6=z3 + punpckhwd mm5,mm3 + movq mm1,mm4 + movq mm3,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6,mm0 + paddw mm0,mm2 ; mm0=in0+in4 + psubw mm6,mm2 ; mm6=in0-in4 + + pxor mm7,mm7 + pxor mm2,mm2 + punpcklwd mm7,mm0 ; mm7=tmp0L + punpckhwd mm2,mm0 ; mm2=tmp0H + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0,mm7 + paddd mm7,mm4 ; mm7=tmp10L + psubd mm0,mm4 ; mm0=tmp13L + movq mm4,mm2 + paddd mm2,mm5 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5,mm5 + pxor mm7,mm7 + punpcklwd mm5,mm6 ; mm5=tmp1L + punpckhwd mm7,mm6 ; mm7=tmp1H + psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2,mm5 + paddd mm5,mm1 ; mm5=tmp11L + psubd mm2,mm1 ; mm2=tmp12L + movq mm0,mm7 
+ paddd mm7,mm3 ; mm7=tmp11H + psubd mm0,mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm5,mm6 + movq mm7,mm4 + paddw mm5,mm3 ; mm5=z3 + paddw mm7,mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2,mm5 + movq mm0,mm5 + punpcklwd mm2,mm7 + punpckhwd mm0,mm7 + movq mm5,mm2 + movq mm7,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2,mm3 + movq mm0,mm3 + punpcklwd mm2,mm4 + punpckhwd mm0,mm4 + movq mm3,mm2 + movq mm4,mm0 + pmaddwd 
mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3,mm5 ; mm3=tmp3L + paddd mm4,mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2,mm1 + movq mm0,mm1 + punpcklwd mm2,mm6 + punpckhwd mm0,mm6 + movq mm1,mm2 + movq mm6,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2,mm5 ; mm2=tmp1L + paddd mm0,mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2,mm5 + movq mm0,mm7 + paddd mm5,mm3 ; mm5=data0L + paddd mm7,mm4 ; mm7=data0H + psubd mm2,mm3 ; mm2=data7L + psubd mm0,mm4 ; mm0=data7H + + movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] + + paddd mm5,mm3 + paddd mm7,mm3 + psrad mm5,DESCALE_P2 + psrad mm7,DESCALE_P2 + paddd mm2,mm3 + paddd mm0,mm3 + psrad mm2,DESCALE_P2 + psrad mm0,DESCALE_P2 + + packssdw mm5,mm7 ; mm5=data0=(00 10 20 30) + packssdw mm2,mm0 ; mm2=data7=(07 17 27 37) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7,mm4 + movq mm0,mm3 + paddd mm4,mm1 ; mm4=data1L + paddd mm3,mm6 ; mm3=data1H + psubd mm7,mm1 ; mm7=data6L + psubd mm0,mm6 ; mm0=data6H + + movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] + + paddd mm4,mm1 + paddd mm3,mm1 + psrad mm4,DESCALE_P2 + psrad mm3,DESCALE_P2 + paddd mm7,mm1 + paddd mm0,mm1 + psrad mm7,DESCALE_P2 + psrad 
mm0,DESCALE_P2 + + packssdw mm4,mm3 ; mm4=data1=(01 11 21 31) + packssdw mm7,mm0 ; mm7=data6=(06 16 26 36) + + packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36) + packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37) + + movq mm6, MMWORD [wk(6)] ; mm6=tmp12L + movq mm1, MMWORD [wk(7)] ; mm1=tmp12H + movq mm3, MMWORD [wk(10)] ; mm3=tmp1L + movq mm0, MMWORD [wk(11)] ; mm0=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) + movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) + + movq mm7,mm6 + movq mm2,mm1 + paddd mm6,mm3 ; mm6=data2L + paddd mm1,mm0 ; mm1=data2H + psubd mm7,mm3 ; mm7=data5L + psubd mm2,mm0 ; mm2=data5H + + movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] + + paddd mm6,mm5 + paddd mm1,mm5 + psrad mm6,DESCALE_P2 + psrad mm1,DESCALE_P2 + paddd mm7,mm5 + paddd mm2,mm5 + psrad mm7,DESCALE_P2 + psrad mm2,DESCALE_P2 + + packssdw mm6,mm1 ; mm6=data2=(02 12 22 32) + packssdw mm7,mm2 ; mm7=data5=(05 15 25 35) + + movq mm4, MMWORD [wk(2)] ; mm4=tmp13L + movq mm3, MMWORD [wk(3)] ; mm3=tmp13H + movq mm0, MMWORD [wk(8)] ; mm0=tmp0L + movq mm5, MMWORD [wk(9)] ; mm5=tmp0H + + movq mm1,mm4 + movq mm2,mm3 + paddd mm4,mm0 ; mm4=data3L + paddd mm3,mm5 ; mm3=data3H + psubd mm1,mm0 ; mm1=data4L + psubd mm2,mm5 ; mm2=data4H + + movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] + + paddd mm4,mm0 + paddd mm3,mm0 + psrad mm4,DESCALE_P2 + psrad mm3,DESCALE_P2 + paddd mm1,mm0 + paddd mm2,mm0 + psrad mm1,DESCALE_P2 + psrad mm2,DESCALE_P2 + + movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] + + packssdw mm4,mm3 ; mm4=data3=(03 13 23 33) + packssdw mm1,mm2 ; mm1=data4=(04 14 24 34) + + movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) + movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) + + packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34) + packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0,mm5 + paddb mm3,mm5 + paddb mm6,mm5 + paddb mm4,mm5 + + movq mm2,mm0 ; transpose coefficients(phase 1) + punpcklbw 
mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37) + movq mm1,mm6 ; transpose coefficients(phase 1) + punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33) + punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35) + + movq mm7,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33) + movq mm5,mm1 ; transpose coefficients(phase 2) + punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17) + punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37) + + movq mm3,mm0 ; transpose coefficients(phase 3) + punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17) + movq mm4,mm7 ; transpose coefficients(phase 3) + punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27) + punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/jidctint-sse2-64.asm b/Builder/jni-1.11/simd/jidctint-sse2-64.asm new file mode 100644 index 000000000..afe1d6a73 --- /dev/null +++ b/Builder/jni-1.11/simd/jidctint-sse2-64.asm @@ -0,0 +1,847 @@ +; +; jidctint.asm - accurate integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; psrad count used at the end of pass 1 +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) ; psrad count used at the end of pass 2 + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants, so each FIX() value below is written pre-scaled by 2^30 and rounded to CONST_BITS fractional bits by DESCALE.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): ; PW_* rows are interleaved 16-bit multiplier pairs consumed by pmaddwd + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding term added before psrad DESCALE_P1 +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; rounding term added before psrad DESCALE_P2 +PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; per-byte offset added with paddb after packing in pass 2 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients.
+; +; GLOBAL(void) +; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 16 + global EXTN(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push rbp + mov rax,rsp ; rax = address of the saved (original) rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax ; keep that address at the aligned frame base (read back by the epilogue's pop rsp) + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] ; rsp = &wk[0]: reserve WK_NUM xmmwords of scratch below rbp + collect_args + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT ; quick scalar probe found nonzero bits: take the full column pass + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT ; OR of blocks 1..7 is nonzero: take the full column pass + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03
03 03 03 03) + pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L + pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H + + movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + 
psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L + pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H + 
pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L + pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H + pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L + pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L + pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H + pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L + pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + 
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + 
+ movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 
17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov rax, [original_rbp] ; NOTE(review): dead load - rax is overwritten by 'mov eax, r13d' two instructions below + mov rdi, r12 ; rdi = output_buf (JSAMPROW *) + mov eax, r13d ; rax = output_col (the 32-bit write zero-extends into rax) + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L + pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L +
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L + pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H + pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L + pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H + pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L + pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H + + paddd 
xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L + pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H + pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L + pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] + + paddd xmm3,xmm2 + paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + 
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; 
xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD 
[rdx+rax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jidctint-sse2.asm b/Builder/jni-1.11/simd/jidctint-sse2.asm new file mode 100644 index 000000000..6c7e7d9b4 --- /dev/null +++ b/Builder/jni-1.11/simd/jidctint-sse2.asm @@ -0,0 +1,858 @@ +; +; jidctint.asm - accurate integer IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw 
(F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 16 + global EXTN(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw 
xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + + movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L 
+ psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * 
-0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + 
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; 
xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose 
coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa 
XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd 
xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3,xmm2 + 
paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; 
xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 
40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jidctred-mmx.asm b/Builder/jni-1.11/simd/jidctred-mmx.asm new file mode 100644 index 000000000..ba054e31a --- /dev/null +++ b/Builder/jni-1.11/simd/jidctred-mmx.asm @@ -0,0 +1,705 @@ +; +; jidctred.asm - reduced-size IDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+; Fixed-point parameters and integer multiplier constants for the IDCT code below.
+%define CONST_BITS 13 ; fraction bits of the F_x_xxx multipliers: FIX(x) = round(x * 2^CONST_BITS)
+%define PASS1_BITS 2 ; pass-1 results stay scaled up by 2^PASS1_BITS (pass 1 shifts right by that much less)
+; Right-shift counts used to descale the 32-bit sums at the end of a pass.
+%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) ; pass 1 of the 4x4 routine (psrad count)
+%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) ; NOTE(review): presumably pass 2 of the 4x4 routine - confirm
+%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) ; NOTE(review): presumably pass 1 of the 2x2 routine - confirm
+%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) ; NOTE(review): presumably pass 2 of the 2x2 routine - confirm
+; F_a_bcd is the constant a.bcd... as a fixed-point integer; this branch hard-codes the values for CONST_BITS == 13.
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; rounding right shift: (x + 2^(n-1)) >> n; literals below are the constants scaled by 2^30
+F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant table; the code addresses its entries as [GOTOFF(ebx,<label>)].
+ alignz 16
+ global EXTN(jconst_idct_red_mmx) ; export the label that marks the start of the table
+
+EXTN(jconst_idct_red_mmx):
+; PW_x_y: the word pair (x, y) stored twice = 4 words / 8 bytes, used as a pmaddwd operand; "MF" marks a negated constant.
+PW_F184_MF076 times 2 dw F_1_847,-F_0_765
+PW_F256_F089 times 2 dw F_2_562, F_0_899
+PW_F106_MF217 times 2 dw F_1_061,-F_2_172
+PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509
+PW_F145_MF021 times 2 dw F_1_451,-F_0_211
+PW_F362_MF127 times 2 dw F_3_624,-F_1_272
+PW_F085_MF072 times 2 dw F_0_850,-F_0_720
+PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1) ; PD_DESCALE_*: two dwords of 2^(n-1), the rounding bias for a right shift by n
+PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; 8 bytes of CENTERJSAMPLE; NOTE(review): the symbol presumably comes from an include - confirm
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; +; GLOBAL(void) +; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_4x4_mmx) + +EXTN(jsimd_idct_4x4_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm0,mm1 + packsswb mm0,mm0 + movd eax,mm0 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0,PASS1_BITS + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm4,mm0 
+ movq mm5,mm0 + punpcklwd mm4,mm1 + punpckhwd mm5,mm1 + movq mm0,mm4 + movq mm1,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6,mm2 + movq mm7,mm2 + punpcklwd mm6,mm3 + punpckhwd mm7,mm3 + movq mm2,mm6 + movq mm3,mm7 + pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6,mm4 ; mm6=tmp2L + paddd mm7,mm5 ; mm7=tmp2H + paddd mm2,mm0 ; mm2=tmp0L + paddd mm3,mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor mm1,mm1 + pxor mm2,mm2 + punpcklwd mm1,mm4 ; mm1=tmp0L + punpckhwd mm2,mm4 ; mm2=tmp0H + psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3,mm5 ; mm5=in2=z2 + punpcklwd mm5,mm0 ; mm0=in6=z3 + punpckhwd mm3,mm0 + pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4,mm1 + movq mm0,mm2 + paddd mm1,mm5 ; mm1=tmp10L + paddd mm2,mm3 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp12L + psubd mm0,mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5,mm1 + movq mm3,mm2 + paddd mm1,mm6 ; mm1=data0L + paddd mm2,mm7 ; mm2=data0H + psubd mm5,mm6 ; mm5=data3L + psubd mm3,mm7 ; mm3=data3H + + movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; 
mm6=[PD_DESCALE_P1_4] + + paddd mm1,mm6 + paddd mm2,mm6 + psrad mm1,DESCALE_P1_4 + psrad mm2,DESCALE_P1_4 + paddd mm5,mm6 + paddd mm3,mm6 + psrad mm5,DESCALE_P1_4 + psrad mm3,DESCALE_P1_4 + + packssdw mm1,mm2 ; mm1=data0=(00 01 02 03) + packssdw mm5,mm3 ; mm5=data3=(30 31 32 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2,mm4 + movq mm3,mm0 + paddd mm4,mm7 ; mm4=data1L + paddd mm0,mm6 ; mm0=data1H + psubd mm2,mm7 ; mm2=data2L + psubd mm3,mm6 ; mm3=data2H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] + + paddd mm4,mm7 + paddd mm0,mm7 + psrad mm4,DESCALE_P1_4 + psrad mm0,DESCALE_P1_4 + paddd mm2,mm7 + paddd mm3,mm7 + psrad mm2,DESCALE_P1_4 + psrad mm3,DESCALE_P1_4 + + packssdw mm4,mm0 ; mm4=data1=(10 11 12 13) + packssdw mm2,mm3 ; mm2=data2=(20 21 22 23) + + movq mm6,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm4 ; mm1=(00 10 01 11) + punpckhwd mm6,mm4 ; mm6=(02 12 03 13) + movq mm7,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm5 ; mm2=(20 30 21 31) + punpckhwd mm7,mm5 ; mm7=(22 32 23 33) + + movq mm0,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm2 ; mm1=(00 10 20 30) + punpckhdq mm0,mm2 ; mm0=(01 11 21 31) + movq mm3,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm7 ; mm6=(02 12 22 32) + punpckhdq mm3,mm7 ; mm3=(03 13 23 33) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm0 + movq mm5,mm0 + punpcklwd mm4,mm1 + punpckhwd mm5,mm1 + movq mm0,mm4 + movq mm1,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6,mm2 + movq mm7,mm2 + punpcklwd mm6,mm3 + punpckhwd mm7,mm3 + movq mm2,mm6 + movq mm3,mm7 + pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6,mm4 ; mm6=tmp2L + paddd mm7,mm5 ; mm7=tmp2H + paddd mm2,mm0 ; mm2=tmp0L + paddd mm3,mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + pxor mm1,mm1 + pxor mm2,mm2 + punpcklwd mm1,mm4 ; mm1=tmp0L + punpckhwd mm2,mm4 ; mm2=tmp0H + psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3,mm5 ; mm5=in2=z2 + punpcklwd mm5,mm0 ; mm0=in6=z3 + punpckhwd mm3,mm0 + pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4,mm1 + movq mm0,mm2 + paddd mm1,mm5 ; mm1=tmp10L + paddd mm2,mm3 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp12L + psubd mm0,mm3 ; mm0=tmp12H + + ; -- Final 
output stage + + movq mm5,mm1 + movq mm3,mm2 + paddd mm1,mm6 ; mm1=data0L + paddd mm2,mm7 ; mm2=data0H + psubd mm5,mm6 ; mm5=data3L + psubd mm3,mm7 ; mm3=data3H + + movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] + + paddd mm1,mm6 + paddd mm2,mm6 + psrad mm1,DESCALE_P2_4 + psrad mm2,DESCALE_P2_4 + paddd mm5,mm6 + paddd mm3,mm6 + psrad mm5,DESCALE_P2_4 + psrad mm3,DESCALE_P2_4 + + packssdw mm1,mm2 ; mm1=data0=(00 10 20 30) + packssdw mm5,mm3 ; mm5=data3=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2,mm4 + movq mm3,mm0 + paddd mm4,mm7 ; mm4=data1L + paddd mm0,mm6 ; mm0=data1H + psubd mm2,mm7 ; mm2=data2L + psubd mm3,mm6 ; mm3=data2H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] + + paddd mm4,mm7 + paddd mm0,mm7 + psrad mm4,DESCALE_P2_4 + psrad mm0,DESCALE_P2_4 + paddd mm2,mm7 + paddd mm3,mm7 + psrad mm2,DESCALE_P2_4 + psrad mm3,DESCALE_P2_4 + + packssdw mm4,mm0 ; mm4=data1=(01 11 21 31) + packssdw mm2,mm3 ; mm2=data2=(02 12 22 32) + + movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32) + packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33) + paddb mm1,mm6 + paddb mm4,mm6 + + movq mm7,mm1 ; transpose coefficients(phase 1) + punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31) + punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33) + + movq mm0,mm1 ; transpose coefficients(phase 2) + punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13) + punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 + movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + + psrlq mm1,4*BYTE_BIT + psrlq mm0,4*BYTE_BIT + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 + movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + + emms ; empty MMX state + + pop edi + pop esi 
+; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + + align 16 + global EXTN(jsimd_idct_2x2_mmx) + +EXTN(jsimd_idct_2x2_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) + ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) + + pcmpeqd mm7,mm7 
+ pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} + + movq mm4,mm0 ; mm4=(10 11 ** 13) + movq mm5,mm2 ; mm5=(50 51 ** 53) + punpcklwd mm4,mm1 ; mm4=(10 30 11 31) + punpcklwd mm5,mm3 ; mm5=(50 70 51 71) + pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] + + psrld mm0,WORD_BIT ; mm0=(11 -- 13 --) + pand mm1,mm7 ; mm1=(-- 31 -- 33) + psrld mm2,WORD_BIT ; mm2=(51 -- 53 --) + pand mm3,mm7 ; mm3=(-- 71 -- 73) + por mm0,mm1 ; mm0=(11 31 13 33) + por mm2,mm3 ; mm2=(51 71 53 73) + pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm4,mm5 ; mm4=tmp0[col0 col1] + + movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] + pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] + pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) + ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) + + psrld mm6,WORD_BIT ; mm6=(15 -- 17 --) + pand mm1,mm7 ; mm1=(-- 35 -- 37) + psrld mm3,WORD_BIT ; mm3=(55 -- 57 --) + pand mm5,mm7 ; mm5=(-- 75 -- 77) + por mm6,mm1 ; mm6=(15 35 17 37) + por mm3,mm5 ; mm3=(55 75 57 77) + pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm0,mm2 ; mm0=tmp0[col1 col3] + paddd mm6,mm3 ; mm6=tmp0[col5 col7] + + ; -- Even part + + movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) + + movq mm2,mm1 ; mm2=(00 01 ** 03) + pslld mm1,WORD_BIT ; mm1=(-- 00 -- **) + psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****] + + 
pand mm2,mm7 ; mm2=(-- 01 -- 03) + pand mm5,mm7 ; mm5=(-- 05 -- 07) + psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] + psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] + + ; -- Final output stage + + movq mm3,mm1 + paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **) + psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **) + punpckldq mm1,mm3 ; mm1=(A0 B0) + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] + + movq mm4,mm2 + movq mm3,mm5 + paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3) + paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7) + psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3) + psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7) + + paddd mm1,mm7 + psrad mm1,DESCALE_P1_2 + + paddd mm2,mm7 + paddd mm5,mm7 + psrad mm2,DESCALE_P1_2 + psrad mm5,DESCALE_P1_2 + paddd mm4,mm7 + paddd mm3,mm7 + psrad mm4,DESCALE_P1_2 + psrad mm3,DESCALE_P1_2 + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3) + packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7) + pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm2,mm5 ; mm2=tmp0[row0 row1] + + ; -- Even part + + pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1] + + ; -- Final output stage + + movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] + + movq mm6,mm1 + paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1) + psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1) + + paddd mm1,mm0 + paddd mm6,mm0 + psrad mm1,DESCALE_P2_2 + psrad mm6,DESCALE_P2_2 + + movq mm7,mm1 ; transpose coefficients + punpckldq mm1,mm6 ; mm1=(C0 D0) + punpckhdq mm7,mm6 ; mm7=(C1 D1) + + packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1) + packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1) + paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)] + + movd ecx,mm1 + movd ebx,mm1 ; ebx=(C0 D0 
C1 D1) + shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jidctred-sse2-64.asm b/Builder/jni-1.11/simd/jidctred-sse2-64.asm new file mode 100644 index 000000000..a54bbe24e --- /dev/null +++ b/Builder/jni-1.11/simd/jidctred-sse2-64.asm @@ -0,0 +1,575 @@ +; +; jidctred.asm - reduced-size IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. 
+; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. 
+; +; GLOBAL(void) +; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm0,xmm1 + packsswb xmm0,xmm0 + packsswb xmm0,xmm0 + movd eax,xmm0 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0,PASS1_BITS + + movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end +%endif 
+.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm0 + punpcklwd xmm4,xmm1 + punpckhwd xmm5,xmm1 + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) + pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) + pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) + pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) + + movdqa xmm6,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm6,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) + pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) + pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) + pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) + + paddd xmm6,xmm4 ; xmm6=tmp2L + paddd xmm7,xmm5 ; xmm7=tmp2H + paddd xmm2,xmm0 ; xmm2=tmp0L + paddd xmm3,xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1,xmm1 + pxor xmm2,xmm2 + punpcklwd xmm1,xmm4 ; xmm1=tmp0L + punpckhwd xmm2,xmm4 ; xmm2=tmp0H + psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & 
pslld xmm2,CONST_BITS+1 + + movdqa xmm3,xmm5 ; xmm5=in2=z2 + punpcklwd xmm5,xmm0 ; xmm0=in6=z3 + punpckhwd xmm3,xmm0 + pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L + pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H + + movdqa xmm4,xmm1 + movdqa xmm0,xmm2 + paddd xmm1,xmm5 ; xmm1=tmp10L + paddd xmm2,xmm3 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp12L + psubd xmm0,xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5,xmm1 + movdqa xmm3,xmm2 + paddd xmm1,xmm6 ; xmm1=data0L + paddd xmm2,xmm7 ; xmm2=data0H + psubd xmm5,xmm6 ; xmm5=data3L + psubd xmm3,xmm7 ; xmm3=data3H + + movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] + + paddd xmm1,xmm6 + paddd xmm2,xmm6 + psrad xmm1,DESCALE_P1_4 + psrad xmm2,DESCALE_P1_4 + paddd xmm5,xmm6 + paddd xmm3,xmm6 + psrad xmm5,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2,xmm4 + movdqa xmm3,xmm0 + paddd xmm4,xmm7 ; xmm4=data1L + paddd xmm0,xmm6 ; xmm0=data1H + psubd xmm2,xmm7 ; xmm2=data2L + psubd xmm3,xmm6 ; xmm3=data2H + + movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] + + paddd xmm4,xmm7 + paddd xmm0,xmm7 + psrad xmm4,DESCALE_P1_4 + psrad xmm0,DESCALE_P1_4 + paddd xmm2,xmm7 + paddd xmm3,xmm7 + psrad xmm2,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 
20 30 01 11 21 31) + punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + pxor xmm4,xmm4 + punpcklwd xmm4,xmm1 ; xmm4=tmp0 + psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1,xmm0 + punpckhwd xmm6,xmm3 + movdqa xmm5,xmm1 + movdqa xmm2,xmm6 + pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) + pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) + pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) + pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) + + paddd xmm6,xmm1 ; xmm6=tmp2 + paddd xmm2,xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0,xmm3 + pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 + + movdqa xmm7,xmm4 + paddd xmm4,xmm0 ; xmm4=tmp10 + psubd xmm7,xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] + + movdqa xmm5,xmm4 + movdqa xmm3,xmm7 + paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4,xmm1 + paddd xmm7,xmm1 + psrad xmm4,DESCALE_P2_4 + psrad xmm7,DESCALE_P2_4 + paddd xmm5,xmm1 + paddd xmm3,xmm1 + psrad xmm5,DESCALE_P2_4 + psrad xmm3,DESCALE_P2_4 + + packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0,xmm4 ; transpose 
coefficients(phase 1) + punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4,[rel PB_CENTERJSAMP] + + pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + + align 16 + global EXTN(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + ; ---- Pass 1: process columns from input. 
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7,xmm7 + pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4,[rel PW_F362_MF127] + pmaddwd xmm5,[rel PW_F085_MF072] + + psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0,[rel PW_F362_MF127] + pmaddwd xmm2,[rel PW_F085_MF072] + + paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3,xmm6 + movdqa xmm5,xmm1 + paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] + + punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7,xmm1 + punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6,xmm2 + psrad xmm6,DESCALE_P1_2 + + paddd xmm1,xmm2 + paddd xmm7,xmm2 + psrad xmm1,DESCALE_P1_2 + psrad xmm7,DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. 
+ + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1,[rel PW_F362_MF127] + pmaddwd xmm7,[rel PW_F085_MF072] + + paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4,xmm6 + paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6,[rel PD_DESCALE_P2_2] + psrad xmm6,DESCALE_P2_2 + + packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6,[rel PB_CENTERJSAMP] + + pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) + pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx + mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx + + pop rbx + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jidctred-sse2.asm b/Builder/jni-1.11/simd/jidctred-sse2.asm new file mode 100644 index 000000000..232d9838d --- /dev/null +++ b/Builder/jni-1.11/simd/jidctred-sse2.asm @@ -0,0 +1,593 @@ +; +; jidctred.asm - reduced-size IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. 
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; Each loaded coef_block row is multiplied (pmullw) by the matching dct_table row; coefficient row 4 is never loaded.
+; One dword (four packed byte samples) is stored to each of output_buf[0..3] at [row + output_col*SIZEOF_JSAMPLE].
+%define dct_table(b) (b)+8  ; void *dct_table
+%define coef_block(b) (b)+12  ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16  ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20  ; JDIMENSION output_col
+
+%define original_ebp ebp+0  ; slot at the aligned ebp where the prologue stores the frame base
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM 2  ; number of xmmword scratch slots reserved below the aligned ebp
+
+        align 16
+        global EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+        push ebp
+        mov eax,esp  ; eax = frame base: points at the saved ebp, arguments start at eax+8
+        sub esp, byte 4  ; leave room for the frame-base slot written below
+        and esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+        mov [esp],eax  ; store the frame base at [original_ebp]
+        mov ebp,esp  ; ebp = aligned ebp
+        lea esp, [wk(0)]  ; reserve wk[0..WK_NUM-1] below ebp
+        pushpic ebx  ; macro from the include files; presumably saves ebx for PIC builds - TODO confirm
+; push ecx ; unused
+; push edx ; need not be preserved
+        push esi
+        push edi
+
+        get_GOT ebx  ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]  ; (disabled: eax still holds the frame base here - assumes get_GOT leaves eax intact)
+        mov edx, POINTER [dct_table(eax)]  ; quantptr
+        mov esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+        mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; quick check: one dword from DWBLOCK(1,..) ...
+        or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]  ; ... OR-ed with one dword from DWBLOCK(2,..)
+        jnz short .columnDCT  ; something non-zero: take the full column pass
+
+        movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; OR together blocks 1,2,3,5,6,7 (block 4 is not used by this routine)
+        movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por xmm0,xmm1
+        packsswb xmm0,xmm0  ; two saturating packs fold all eight OR-ed words ...
+        packsswb xmm0,xmm0  ; ... into the low dword (non-zero words stay non-zero)
+        movd eax,xmm0
+        test eax,eax
+        jnz short .columnDCT  ; at least one of those terms is non-zero
+
+        ; -- AC terms all zero
+
+        movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        psllw xmm0,PASS1_BITS  ; scale the products up by PASS1_BITS
+
+        movdqa xmm3,xmm0  ; xmm0=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm0,xmm0  ; xmm0=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm3,xmm3  ; xmm3=(04 04 05 05 06 06 07 07)
+
+        pshufd xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+        pshufd xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+        pshufd xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+        pshufd xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+        jmp near .column_end  ; skip the full column pass
+        alignx 16,7
+%endif
+.columnDCT:
+
+        ; -- Odd part
+
+        movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        movdqa xmm4,xmm0
+        movdqa xmm5,xmm0
+        punpcklwd xmm4,xmm1  ; interleave products 1 and 3, low four columns
+        punpckhwd xmm5,xmm1  ; interleave products 1 and 3, high four columns
+        movdqa xmm0,xmm4
+        movdqa xmm1,xmm5
+        pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)]  ; xmm4=(tmp2L)
+        pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)]  ; xmm5=(tmp2H)
+        pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)]  ; xmm0=(tmp0L)
+        pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)]  ; xmm1=(tmp0H)
+
+        movdqa xmm6,xmm2
+        movdqa xmm7,xmm2
+        punpcklwd xmm6,xmm3  ; interleave products 5 and 7, low four columns
+        punpckhwd xmm7,xmm3  ; interleave products 5 and 7, high four columns
+        movdqa xmm2,xmm6
+        movdqa xmm3,xmm7
+        pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2L)
+        pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)]  ; xmm7=(tmp2H)
+        pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)]  ; xmm2=(tmp0L)
+        pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)]  ; xmm3=(tmp0H)
+
+        paddd xmm6,xmm4  ; xmm6=tmp2L
+        paddd xmm7,xmm5  ; xmm7=tmp2H
+        paddd xmm2,xmm0  ; xmm2=tmp0L
+        paddd xmm3,xmm1  ; xmm3=tmp0H
+
+        movdqa XMMWORD [wk(0)], xmm2  ; wk(0)=tmp0L
+        movdqa XMMWORD [wk(1)], xmm3  ; wk(1)=tmp0H
+
+        ; -- Even part
+
+        movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        pxor xmm1,xmm1
+        pxor xmm2,xmm2
+        punpcklwd xmm1,xmm4  ; xmm1=tmp0L
+        punpckhwd xmm2,xmm4  ; xmm2=tmp0H
+        psrad xmm1,(16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+        psrad xmm2,(16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+        movdqa xmm3,xmm5  ; xmm5=in2=z2
+        punpcklwd xmm5,xmm0  ; xmm0=in6=z3
+        punpckhwd xmm3,xmm0
+        pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)]  ; xmm5=tmp2L
+        pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)]  ; xmm3=tmp2H
+
+        movdqa xmm4,xmm1
+        movdqa xmm0,xmm2
+        paddd xmm1,xmm5  ; xmm1=tmp10L
+        paddd xmm2,xmm3  ; xmm2=tmp10H
+        psubd xmm4,xmm5  ; xmm4=tmp12L
+        psubd xmm0,xmm3  ; xmm0=tmp12H
+
+        ; -- Final output stage
+
+        movdqa xmm5,xmm1
+        movdqa xmm3,xmm2
+        paddd xmm1,xmm6  ; xmm1=data0L
+        paddd xmm2,xmm7  ; xmm2=data0H
+        psubd xmm5,xmm6  ; xmm5=data3L
+        psubd xmm3,xmm7  ; xmm3=data3H
+
+        movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm6=[PD_DESCALE_P1_4]
+
+        paddd xmm1,xmm6  ; add the rounding bias ...
+        paddd xmm2,xmm6
+        psrad xmm1,DESCALE_P1_4  ; ... then shift right by DESCALE_P1_4
+        psrad xmm2,DESCALE_P1_4
+        paddd xmm5,xmm6
+        paddd xmm3,xmm6
+        psrad xmm5,DESCALE_P1_4
+        psrad xmm3,DESCALE_P1_4
+
+        packssdw xmm1,xmm2  ; xmm1=data0=(00 01 02 03 04 05 06 07)
+        packssdw xmm5,xmm3  ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+        movdqa xmm7, XMMWORD [wk(0)]  ; xmm7=tmp0L
+        movdqa xmm6, XMMWORD [wk(1)]  ; xmm6=tmp0H
+
+        movdqa xmm2,xmm4
+        movdqa xmm3,xmm0
+        paddd xmm4,xmm7  ; xmm4=data1L
+        paddd xmm0,xmm6  ; xmm0=data1H
+        psubd xmm2,xmm7  ; xmm2=data2L
+        psubd xmm3,xmm6  ; xmm3=data2H
+
+        movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm7=[PD_DESCALE_P1_4]
+
+        paddd xmm4,xmm7
+        paddd xmm0,xmm7
+        psrad xmm4,DESCALE_P1_4
+        psrad xmm0,DESCALE_P1_4
+        paddd xmm2,xmm7
+        paddd xmm3,xmm7
+        psrad xmm2,DESCALE_P1_4
+        psrad xmm3,DESCALE_P1_4
+
+        packssdw xmm4,xmm0  ; xmm4=data1=(10 11 12 13 14 15 16 17)
+        packssdw xmm2,xmm3  ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+        movdqa xmm6,xmm1  ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm4  ; xmm1=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm6,xmm4  ; xmm6=(04 14 05 15 06 16 07 17)
+        movdqa xmm7,xmm2  ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm5  ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm7,xmm5  ; xmm7=(24 34 25 35 26 36 27 37)
+
+        movdqa xmm0,xmm1  ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm2  ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm0,xmm2  ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+        movdqa xmm3,xmm6  ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm7  ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm3,xmm7  ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:  ; both paths arrive with xmm1,xmm0,xmm6,xmm3 = [col0 col1],[col2 col3],[col4 col5],[col6 col7]
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov eax, [original_ebp]  ; reload the frame base stored by the prologue
+        mov edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+        mov eax, JDIMENSION [output_col(eax)]  ; eax = output_col
+
+        ; -- Even part
+
+        pxor xmm4,xmm4
+        punpcklwd xmm4,xmm1  ; xmm4=tmp0
+        psrad xmm4,(16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+        ; -- Odd part
+
+        punpckhwd xmm1,xmm0  ; xmm1 = col1 and col3 interleaved
+        punpckhwd xmm6,xmm3  ; xmm6 = col5 and col7 interleaved; col4 (low half of xmm6) is not used in pass 2
+        movdqa xmm5,xmm1
+        movdqa xmm2,xmm6
+        pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)]  ; xmm1=(tmp2)
+        pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2)
+        pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)]  ; xmm5=(tmp0)
+        pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)]  ; xmm2=(tmp0)
+
+        paddd xmm6,xmm1  ; xmm6=tmp2
+        paddd xmm2,xmm5  ; xmm2=tmp0
+
+        ; -- Even part
+
+        punpcklwd xmm0,xmm3  ; xmm0 = col2 and col6 interleaved
+        pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)]  ; xmm0=tmp2
+
+        movdqa xmm7,xmm4
+        paddd xmm4,xmm0  ; xmm4=tmp10
+        psubd xmm7,xmm0  ; xmm7=tmp12
+
+        ; -- Final output stage
+
+        movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]  ; xmm1=[PD_DESCALE_P2_4]
+
+        movdqa xmm5,xmm4
+        movdqa xmm3,xmm7
+        paddd xmm4,xmm6  ; xmm4=data0=(00 10 20 30)
+        paddd xmm7,xmm2  ; xmm7=data1=(01 11 21 31)
+        psubd xmm5,xmm6  ; xmm5=data3=(03 13 23 33)
+        psubd xmm3,xmm2  ; xmm3=data2=(02 12 22 32)
+
+        paddd xmm4,xmm1  ; add the rounding bias ...
+        paddd xmm7,xmm1
+        psrad xmm4,DESCALE_P2_4  ; ... then shift right by DESCALE_P2_4
+        psrad xmm7,DESCALE_P2_4
+        paddd xmm5,xmm1
+        paddd xmm3,xmm1
+        psrad xmm5,DESCALE_P2_4
+        psrad xmm3,DESCALE_P2_4
+
+        packssdw xmm4,xmm3  ; xmm4=(00 10 20 30 02 12 22 32)
+        packssdw xmm7,xmm5  ; xmm7=(01 11 21 31 03 13 23 33)
+
+        movdqa xmm0,xmm4  ; transpose coefficients(phase 1)
+        punpcklwd xmm4,xmm7  ; xmm4=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm0,xmm7  ; xmm0=(02 03 12 13 22 23 32 33)
+
+        movdqa xmm6,xmm4  ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm0  ; xmm4=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm6,xmm0  ; xmm6=(20 21 22 23 30 31 32 33)
+
+        packsswb xmm4,xmm6  ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+        paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]  ; add CENTERJSAMPLE to every byte
+
+        pshufd xmm2,xmm4,0x39  ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+        pshufd xmm1,xmm4,0x4E  ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+        pshufd xmm3,xmm4,0x93  ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+        mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; edx = output_buf[0]
+        mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; esi = output_buf[1]
+        movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4  ; low dword = (00 01 02 03)
+        movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2  ; low dword = (10 11 12 13)
+        mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; edx = output_buf[2]
+        mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; esi = output_buf[3]
+        movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1  ; low dword = (20 21 22 23)
+        movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3  ; low dword = (30 31 32 33)
+
+        pop edi
+        pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+        poppic ebx
+        mov esp,ebp  ; esp <- aligned ebp
+        pop esp  ; esp <- frame base stored by the prologue (points at the saved ebp)
+        pop ebp  ; restore the caller's ebp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; Only blocks 0,1,3,5,7 of coef_block are loaded, each multiplied (pmullw) by the matching dct_table block.
+; One word (two packed byte samples) is stored to each of output_buf[0..1] at [row + output_col*SIZEOF_JSAMPLE].
+%define dct_table(b) (b)+8  ; void *dct_table
+%define coef_block(b) (b)+12  ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16  ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20  ; JDIMENSION output_col
+
+        align 16
+        global EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+        push ebp
+        mov ebp,esp
+        push ebx  ; always saved: ebx is reused as a scratch register by pextrw near the end
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+        push esi
+        push edi
+
+        get_GOT ebx  ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+        mov edx, POINTER [dct_table(ebp)]  ; quantptr
+        mov esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+        ; | input:                  | result:        |
+        ; | 00 01 ** 03 ** 05 ** 07 |                |
+        ; | 10 11 ** 13 ** 15 ** 17 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+        ; | 50 51 ** 53 ** 55 ** 57 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+        ; -- Odd part
+
+        movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+        pcmpeqd xmm7,xmm7  ; xmm7 = all ones
+        pslld xmm7,WORD_BIT  ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+        movdqa xmm4,xmm0  ; xmm4=(10 11 ** 13 ** 15 ** 17)
+        movdqa xmm5,xmm2  ; xmm5=(50 51 ** 53 ** 55 ** 57)
+        punpcklwd xmm4,xmm1  ; xmm4=(10 30 11 31 ** ** 13 33)
+        punpcklwd xmm5,xmm3  ; xmm5=(50 70 51 71 ** ** 53 73)
+        pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)]  ; in1*F_3_624 - in3*F_1_272 per dword
+        pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)]  ; in5*F_0_850 - in7*F_0_720 per dword
+
+        psrld xmm0,WORD_BIT  ; xmm0=(11 -- 13 -- 15 -- 17 --)
+        pand xmm1,xmm7  ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+        psrld xmm2,WORD_BIT  ; xmm2=(51 -- 53 -- 55 -- 57 --)
+        pand xmm3,xmm7  ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+        por xmm0,xmm1  ; xmm0=(11 31 13 33 15 35 17 37)
+        por xmm2,xmm3  ; xmm2=(51 71 53 73 55 75 57 77)
+        pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]  ; same products for the odd columns
+        pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+        paddd xmm4,xmm5  ; xmm4=tmp0[col0 col1 **** col3]
+        paddd xmm0,xmm2  ; xmm0=tmp0[col1 col3 col5 col7]
+
+        ; -- Even part
+
+        movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+        movdqa xmm1,xmm6  ; xmm1=(00 01 ** 03 ** 05 ** 07)
+        pslld xmm6,WORD_BIT  ; xmm6=(-- 00 -- ** -- ** -- **)
+        pand xmm1,xmm7  ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+        psrad xmm6,(WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+        psrad xmm1,(WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+        ; -- Final output stage
+
+        movdqa xmm3,xmm6
+        movdqa xmm5,xmm1
+        paddd xmm6,xmm4  ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+        paddd xmm1,xmm0  ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+        psubd xmm3,xmm4  ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+        psubd xmm5,xmm0  ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+        movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]  ; xmm2=[PD_DESCALE_P1_2]
+
+        punpckldq xmm6,xmm3  ; xmm6=(A0 B0 ** **)
+
+        movdqa xmm7,xmm1
+        punpcklqdq xmm1,xmm5  ; xmm1=(A1 A3 B1 B3)
+        punpckhqdq xmm7,xmm5  ; xmm7=(A5 A7 B5 B7)
+
+        paddd xmm6,xmm2  ; add the rounding bias ...
+        psrad xmm6,DESCALE_P1_2  ; ... then shift right by DESCALE_P1_2
+
+        paddd xmm1,xmm2
+        paddd xmm7,xmm2
+        psrad xmm1,DESCALE_P1_2
+        psrad xmm7,DESCALE_P1_2
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+        mov eax, JDIMENSION [output_col(ebp)]  ; eax = output_col
+
+        ; | input:| result:|
+        ; | A0 B0 |        |
+        ; | A1 B1 | C0 C1  |
+        ; | A3 B3 | D0 D1  |
+        ; | A5 B5 |        |
+        ; | A7 B7 |        |
+
+        ; -- Odd part
+
+        packssdw xmm1,xmm1  ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+        packssdw xmm7,xmm7  ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+        pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)]  ; A1*F_3_624 - A3*F_1_272 (likewise for B)
+        pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)]  ; A5*F_0_850 - A7*F_0_720 (likewise for B)
+
+        paddd xmm1,xmm7  ; xmm1=tmp0[row0 row1 row0 row1]
+
+        ; -- Even part
+
+        pslld xmm6,(CONST_BITS+2)  ; xmm6=tmp10[row0 row1 **** ****]
+
+        ; -- Final output stage
+
+        movdqa xmm4,xmm6
+        paddd xmm6,xmm1  ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+        psubd xmm4,xmm1  ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+        punpckldq xmm6,xmm4  ; xmm6=(C0 D0 C1 D1)
+
+        paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]  ; add the rounding bias ...
+        psrad xmm6,DESCALE_P2_2  ; ... then shift right by DESCALE_P2_2
+
+        packssdw xmm6,xmm6  ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+        packsswb xmm6,xmm6  ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+        paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]  ; add CENTERJSAMPLE to every byte; last GOT-relative access before ebx is overwritten
+
+        pextrw ebx,xmm6,0x00  ; ebx=(C0 D0 -- --)
+        pextrw ecx,xmm6,0x01  ; ecx=(C1 D1 -- --)
+
+        mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; edx = output_buf[0]
+        mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; esi = output_buf[1]
+        mov WORD [edx+eax*SIZEOF_JSAMPLE], bx  ; store bytes (C0 D0)
+        mov WORD [esi+eax*SIZEOF_JSAMPLE], cx  ; store bytes (C1 D1)
+
+        pop edi
+        pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+        pop ebx
+        pop ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16 diff --git a/Builder/jni-1.11/simd/jpeg_nbits_table.inc b/Builder/jni-1.11/simd/jpeg_nbits_table.inc new file mode 100644 index 000000000..cbc69904e --- /dev/null +++ b/Builder/jni-1.11/simd/jpeg_nbits_table.inc @@ -0,0 +1,4097 @@ +jpeg_nbits_table db \ + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 
12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
\ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
\ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
\ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
\ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
\ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
\ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
\ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
\ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 diff --git a/Builder/jni-1.11/simd/jquant-3dn.asm b/Builder/jni-1.11/simd/jquant-3dn.asm new file mode 100644 index 000000000..0b4164b26 --- /dev/null +++ b/Builder/jni-1.11/simd/jquant-3dn.asm @@ -0,0 +1,232 @@ +; +; jquant.asm - sample data conversion and quantization (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_3dnow) + +EXTN(jsimd_convsamp_float_3dnow): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7,mm7 + psllw mm7,7 + packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+ + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0,mm7 ; mm0=(01234567) + psubb mm1,mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4,mm2 ; mm4=(***0***1) + punpckhwd mm2,mm2 ; mm2=(***2***3) + punpcklwd mm5,mm0 ; mm5=(***4***5) + punpckhwd mm0,mm0 ; mm0=(***6***7) + + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) + pi2fd mm4,mm4 + pi2fd mm2,mm2 + psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) + pi2fd mm5,mm5 + pi2fd mm0,mm0 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + + punpcklwd mm6,mm3 ; mm6=(***8***9) + punpckhwd mm3,mm3 ; mm3=(***A***B) + punpcklwd mm4,mm1 ; mm4=(***C***D) + punpckhwd mm1,mm1 ; mm1=(***E***F) + + psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) + pi2fd mm6,mm6 + pi2fd mm3,mm3 + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) + pi2fd mm4,mm4 + pi2fd mm1,mm1 + + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + femms ; empty MMX/3DNow!
state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_3dnow) + +EXTN(jsimd_quantize_float_3dnow): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) + movd mm7,eax + punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F} + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 +.quantloop: + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm0,mm7 ; mm0=(00 ** 01 **) + pfadd mm1,mm7 ; mm1=(02 ** 03 **) + pfadd mm2,mm7 ; mm2=(04 ** 05 **) + pfadd mm3,mm7 ; mm3=(06 ** 07 **) + + movq mm4,mm0 + punpcklwd mm0,mm1 ; mm0=(00 02 ** **) + punpckhwd mm4,mm1 ; mm4=(01 03 ** **) + movq mm5,mm2 + punpcklwd mm2,mm3 ; mm2=(04 06 ** **) + punpckhwd mm5,mm3 ; mm5=(05 07 ** **) + + punpcklwd mm0,mm4 ; mm0=(00 01 02 03) + punpcklwd mm2,mm5 ; mm2=(04 05 06 07) + + movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD
[MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] + movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm6,mm7 ; mm6=(10 ** 11 **) + pfadd mm1,mm7 ; mm1=(12 ** 13 **) + pfadd mm3,mm7 ; mm3=(14 ** 15 **) + pfadd mm4,mm7 ; mm4=(16 ** 17 **) + + movq mm5,mm6 + punpcklwd mm6,mm1 ; mm6=(10 12 ** **) + punpckhwd mm5,mm1 ; mm5=(11 13 ** **) + movq mm1,mm3 + punpcklwd mm3,mm4 ; mm3=(14 16 ** **) + punpckhwd mm1,mm4 ; mm1=(15 17 ** **) + + punpcklwd mm6,mm5 ; mm6=(10 11 12 13) + punpcklwd mm3,mm1 ; mm3=(14 15 16 17) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jquant-mmx.asm b/Builder/jni-1.11/simd/jquant-mmx.asm new file mode 100644 index 000000000..aed6071bf --- /dev/null +++ b/Builder/jni-1.11/simd/jquant-mmx.asm @@ -0,0 +1,273 @@ +; +; jquant.asm - sample data conversion and quantization (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_convsamp_mmx) + +EXTN(jsimd_convsamp_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor mm6,mm6 ; mm6=(all 0's) + pcmpeqw mm7,mm7 + psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) + movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) + + movq mm4,mm0 + punpcklbw mm0,mm6 ; mm0=(0123) + punpckhbw mm4,mm6 ; mm4=(4567) + movq mm5,mm1 + punpcklbw mm1,mm6 ; mm1=(89AB) + punpckhbw
mm5,mm6 ; mm5=(CDEF) + + paddw mm0,mm7 + paddw mm4,mm7 + paddw mm1,mm7 + paddw mm5,mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 + + movq mm0,mm2 + punpcklbw mm2,mm6 ; mm2=(GHIJ) + punpckhbw mm0,mm6 ; mm0=(KLMN) + movq mm4,mm3 + punpcklbw mm3,mm6 ; mm3=(OPQR) + punpckhbw mm4,mm6 ; mm4=(STUV) + + paddw mm2,mm7 + paddw mm0,mm7 + paddw mm3,mm7 + paddw mm4,mm7 + + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/).
+; +; GLOBAL(void) +; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) +%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM) + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM *divisors +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_quantize_mmx) + +EXTN(jsimd_quantize_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov ah, 2 + alignx 16,7 +.quantloop1: + mov al, DCTSIZE2/8/2 + alignx 16,7 +.quantloop2: + movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] + + movq mm0,mm2 + movq mm1,mm3 + + psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise + psraw mm3,(WORD_BIT-1) + + pxor mm0,mm2 ; val = abs(val): (val ^ mask) - mask + pxor mm1,mm3 + psubw mm0,mm2 + psubw mm1,mm3 + + ; + ; MMX is an annoyingly crappy instruction set. It has two + ; misfeatures that are causing problems here: + ; + ; - All multiplications are signed. + ; + ; - The second operand for the shifts is not treated as packed.
+ ; + ; + ; We work around the first problem by implementing this algorithm: + ; + ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) + ; { + ; enum { SHORT_BIT = 16 }; + ; signed short sx = (signed short) x; + ; signed short sy = (signed short) y; + ; signed long sz; + ; + ; sz = (long) sx * (long) sy; /* signed multiply */ + ; + ; if (sx < 0) sz += (long) sy << SHORT_BIT; + ; if (sy < 0) sz += (long) sx << SHORT_BIT; + ; + ; return (unsigned long) sz; + ; } + ; + ; (note that a negative sx adds _sy_ and vice versa) + ; + ; For the second problem, we replace the shift by a multiplication. + ; Unfortunately that means we have to deal with the signed issue again. + ; + + paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw mm1, MMWORD [CORRECTION(0,1,edx)] + + movq mm4,mm0 ; store current value for later + movq mm5,mm1 + pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] + paddw mm0,mm4 ; reciprocal is always negative (MSB=1), + paddw mm1,mm5 ; so we always need to add the initial value + ; (input value is never negative as we + ; inverted it at the start of this routine) + + ; here it gets a bit tricky as both scale + ; and mm0/mm1 can be negative + movq mm6, MMWORD [SCALE(0,0,edx)] ; scale + movq mm7, MMWORD [SCALE(0,1,edx)] + movq mm4,mm0 + movq mm5,mm1 + pmulhw mm0,mm6 + pmulhw mm1,mm7 + + psraw mm6,(WORD_BIT-1) ; determine if scale is negative + psraw mm7,(WORD_BIT-1) + + pand mm6,mm4 ; and add input if it is + pand mm7,mm5 + paddw mm0,mm6 + paddw mm1,mm7 + + psraw mm4,(WORD_BIT-1) ; then check if negative input + psraw mm5,(WORD_BIT-1) + + pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is + pand mm5, MMWORD [SCALE(0,1,edx)] + paddw mm0,mm4 + paddw mm1,mm5 + + pxor mm0,mm2 ; restore sign: (val ^ mask) - mask + pxor mm1,mm3 + psubw mm0,mm2 + psubw mm1,mm3 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 + + add esi, byte
8*SIZEOF_DCTELEM + add edx, byte 8*SIZEOF_DCTELEM + add edi, byte 8*SIZEOF_JCOEF + dec al + jnz near .quantloop2 + dec ah + jnz near .quantloop1 ; to avoid branch misprediction + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jquant-sse.asm b/Builder/jni-1.11/simd/jquant-sse.asm new file mode 100644 index 000000000..1baf88f25 --- /dev/null +++ b/Builder/jni-1.11/simd/jquant-sse.asm @@ -0,0 +1,210 @@ +; +; jquant.asm - sample data conversion and quantization (SSE & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_sse) + +EXTN(jsimd_convsamp_float_sse): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7,mm7 + psllw mm7,7 + packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0,mm7 ; mm0=(01234567) + psubb mm1,mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4,mm2 ; mm4=(***0***1) + punpckhwd mm2,mm2 ; mm2=(***2***3) + punpcklwd mm5,mm0 ; mm5=(***4***5) + punpckhwd mm0,mm0 ; mm0=(***6***7) + + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) + cvtpi2ps xmm0,mm4 ; xmm0=(01**) + cvtpi2ps xmm1,mm2 ; xmm1=(23**) + psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) + cvtpi2ps xmm2,mm5 ; xmm2=(45**) + cvtpi2ps xmm3,mm0 ;
xmm3=(67**) + + punpcklwd mm6,mm3 ; mm6=(***8***9) + punpckhwd mm3,mm3 ; mm3=(***A***B) + punpcklwd mm4,mm1 ; mm4=(***C***D) + punpckhwd mm1,mm1 ; mm1=(***E***F) + + psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) + cvtpi2ps xmm4,mm6 ; xmm4=(89**) + cvtpi2ps xmm5,mm3 ; xmm5=(AB**) + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) + cvtpi2ps xmm6,mm4 ; xmm6=(CD**) + cvtpi2ps xmm7,mm1 ; xmm7=(EF**) + + movlhps xmm0,xmm1 ; xmm0=(0123) + movlhps xmm2,xmm3 ; xmm2=(4567) + movlhps xmm4,xmm5 ; xmm4=(89AB) + movlhps xmm6,xmm7 ; xmm6=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_sse) + +EXTN(jsimd_quantize_float_sse): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD
[XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + movhlps xmm4,xmm0 + movhlps xmm5,xmm1 + + cvtps2pi mm0,xmm0 + cvtps2pi mm1,xmm1 + cvtps2pi mm4,xmm4 + cvtps2pi mm5,xmm5 + + movhlps xmm6,xmm2 + movhlps xmm7,xmm3 + + cvtps2pi mm2,xmm2 + cvtps2pi mm3,xmm3 + cvtps2pi mm6,xmm6 + cvtps2pi mm7,xmm7 + + packssdw mm0,mm4 + packssdw mm1,mm5 + packssdw mm2,mm6 + packssdw mm3,mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jquantf-sse2-64.asm b/Builder/jni-1.11/simd/jquantf-sse2-64.asm new file mode 100644 index 000000000..ef5c1f959 --- /dev/null +++ b/Builder/jni-1.11/simd/jquantf-sse2-64.asm @@ -0,0 +1,157 @@ +; +; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11 = JDIMENSION start_col +; r12 = FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+ + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/2 +.convloop: + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] + + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + + add rsi, byte 2*SIZEOF_JSAMPROW + add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz short .convloop + + pop rbx + uncollect_args + pop rbp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +; r10 = JCOEFPTR coef_block +; r11 = FAST_FLOAT *divisors +; r12 = FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rsi, r12 ; rsi = workspace + mov rdx, r11 ; rdx = divisors + mov rdi, r10 ; rdi = coef_block + mov rax, DCTSIZE2/16 ; rax = loop count (16 coefficients per pass) +.quantloop: + movaps
xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 + + add rsi, byte 16*SIZEOF_FAST_FLOAT + add rdx, byte 16*SIZEOF_FAST_FLOAT + add rdi, byte 16*SIZEOF_JCOEF + dec rax + jnz short .quantloop + + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jquantf-sse2.asm b/Builder/jni-1.11/simd/jquantf-sse2.asm new file mode 100644 index 000000000..1cbc26740 --- /dev/null +++ b/Builder/jni-1.11/simd/jquantf-sse2.asm @@ -0,0 +1,170 @@ +; +; jquantf.asm - sample data conversion and quantization (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3
; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD
[XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jquanti-altivec.c b/Builder/jni-1.11/simd/jquanti-altivec.c new file mode 100644 index 000000000..25cc296f7 --- /dev/null +++ b/Builder/jni-1.11/simd/jquanti-altivec.c @@ -0,0 +1,252 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_altivec.h" + + +/* NOTE: The address will either be aligned or offset by 8 bytes, so we can + * always get the data we want by using a single vector load (although we may + * have to permute the result.)
+ */ +#if __BIG_ENDIAN__ + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_ld(0, elemptr); \ + if ((size_t)elemptr & 15) \ + in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \ +} + +#else + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_vsx_ld(0, elemptr); \ +} + +#endif + +/* Read eight rows of samples starting at start_col, pass each through VEC_UNPACKHU (from jsimd_altivec.h; presumably a zero-extending unpack -- confirm there), subtract CENTERJSAMPLE, and store the eight 16-byte results to workspace. */ +void +jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + JSAMPROW elemptr; /* assigned by each LOAD_ROW() expansion */ + + __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7; + __vector short out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) }; + __vector unsigned char pb_zero = { __16X(0) }; /* not referenced directly below; presumably used inside VEC_UNPACKHU -- confirm in jsimd_altivec.h */ + + LOAD_ROW(0); + LOAD_ROW(1); + LOAD_ROW(2); + LOAD_ROW(3); + LOAD_ROW(4); + LOAD_ROW(5); + LOAD_ROW(6); + LOAD_ROW(7); + /* Convert each loaded row to a vector of shorts */ + out0 = (__vector short)VEC_UNPACKHU(in0); + out1 = (__vector short)VEC_UNPACKHU(in1); + out2 = (__vector short)VEC_UNPACKHU(in2); + out3 = (__vector short)VEC_UNPACKHU(in3); + out4 = (__vector short)VEC_UNPACKHU(in4); + out5 = (__vector short)VEC_UNPACKHU(in5); + out6 = (__vector short)VEC_UNPACKHU(in6); + out7 = (__vector short)VEC_UNPACKHU(in7); + /* Re-center: subtract CENTERJSAMPLE from every element */ + out0 = vec_sub(out0, pw_centerjsamp); + out1 = vec_sub(out1, pw_centerjsamp); + out2 = vec_sub(out2, pw_centerjsamp); + out3 = vec_sub(out3, pw_centerjsamp); + out4 = vec_sub(out4, pw_centerjsamp); + out5 = vec_sub(out5, pw_centerjsamp); + out6 = vec_sub(out6, pw_centerjsamp); + out7 = vec_sub(out7, pw_centerjsamp); + /* Store the eight rows at consecutive 16-byte offsets */ + vec_st(out0, 0, workspace); + vec_st(out1, 16, workspace); + vec_st(out2, 32, workspace); + vec_st(out3, 48, workspace); + vec_st(out4, 64, workspace); + vec_st(out5, 80, workspace); + vec_st(out6, 96, workspace); + vec_st(out7, 112, workspace); +} + + +#define WORD_BIT 16 + +/* There is no AltiVec 16-bit unsigned multiply instruction, hence this. + We basically need an unsigned equivalent of vec_madds().
*/ + +#define MULTIPLY(vs0, vs1, out) { \ + tmpe = vec_mule((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + tmpo = vec_mulo((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + out = (__vector short)vec_perm((__vector unsigned short)tmpe, \ + (__vector unsigned short)tmpo, \ + shift_pack_index); \ +} +/* Quantize an 8x8 block: take the absolute value of workspace, add the correction term, multiply by the reciprocal and then by the scale (MULTIPLY keeps the upper 16 bits of each 32-bit product, as selected by shift_pack_index), reapply the sign, and store to coef_block. */ +void +jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, + corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, + recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, + scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; + __vector unsigned int tmpe, tmpo; /* even/odd products written by MULTIPLY() */ + + /* Constants */ + __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) }; +#if __BIG_ENDIAN__ + __vector unsigned char shift_pack_index = + {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29}; +#else + __vector unsigned char shift_pack_index = + {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31}; +#endif + + row0 = vec_ld(0, workspace); + row1 = vec_ld(16, workspace); + row2 = vec_ld(32, workspace); + row3 = vec_ld(48, workspace); + row4 = vec_ld(64, workspace); + row5 = vec_ld(80, workspace); + row6 = vec_ld(96, workspace); + row7 = vec_ld(112, workspace); + + /* Branch-less absolute value */ + row0s = vec_sra(row0, pw_word_bit_m1); + row1s = vec_sra(row1, pw_word_bit_m1); + row2s = vec_sra(row2, pw_word_bit_m1); + row3s = vec_sra(row3, pw_word_bit_m1); + row4s = vec_sra(row4, pw_word_bit_m1); + row5s = vec_sra(row5, pw_word_bit_m1); + row6s = vec_sra(row6, pw_word_bit_m1); + row7s = vec_sra(row7, pw_word_bit_m1); + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); +
row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + /* Correction terms are read starting DCTSIZE2 * 2 bytes into divisors */ + corr0 = vec_ld(DCTSIZE2 * 2, divisors); + corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors); + corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors); + corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors); + corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors); + corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors); + corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors); + corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors); + + row0 = vec_add(row0, corr0); + row1 = vec_add(row1, corr1); + row2 = vec_add(row2, corr2); + row3 = vec_add(row3, corr3); + row4 = vec_add(row4, corr4); + row5 = vec_add(row5, corr5); + row6 = vec_add(row6, corr6); + row7 = vec_add(row7, corr7); + /* Reciprocals are read from the start of divisors */ + recip0 = vec_ld(0, divisors); + recip1 = vec_ld(16, divisors); + recip2 = vec_ld(32, divisors); + recip3 = vec_ld(48, divisors); + recip4 = vec_ld(64, divisors); + recip5 = vec_ld(80, divisors); + recip6 = vec_ld(96, divisors); + recip7 = vec_ld(112, divisors); + + MULTIPLY(row0, recip0, row0); + MULTIPLY(row1, recip1, row1); + MULTIPLY(row2, recip2, row2); + MULTIPLY(row3, recip3, row3); + MULTIPLY(row4, recip4, row4); + MULTIPLY(row5, recip5, row5); + MULTIPLY(row6, recip6, row6); + MULTIPLY(row7, recip7, row7); + /* Scale factors are read starting DCTSIZE2 * 4 bytes into divisors */ + scale0 = vec_ld(DCTSIZE2 * 4, divisors); + scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors); + scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors); + scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors); + scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors); + scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors); + scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors); + scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors); + + MULTIPLY(row0, scale0, row0); + MULTIPLY(row1, scale1, row1); + MULTIPLY(row2, scale2, row2); + MULTIPLY(row3, scale3, row3); + MULTIPLY(row4, scale4, row4); + MULTIPLY(row5, scale5, row5); + MULTIPLY(row6, scale6, row6); + MULTIPLY(row7, scale7,
row7); + /* Reapply the signs saved in row0s..row7s (xor, then subtract the mask) */ + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + /* Store the eight quantized rows to coef_block */ + vec_st(row0, 0, coef_block); + vec_st(row1, 16, coef_block); + vec_st(row2, 32, coef_block); + vec_st(row3, 48, coef_block); + vec_st(row4, 64, coef_block); + vec_st(row5, 80, coef_block); + vec_st(row6, 96, coef_block); + vec_st(row7, 112, coef_block); +} diff --git a/Builder/jni-1.11/simd/jquanti-sse2-64.asm b/Builder/jni-1.11/simd/jquanti-sse2-64.asm new file mode 100644 index 000000000..66c4e5190 --- /dev/null +++ b/Builder/jni-1.11/simd/jquanti-sse2-64.asm @@ -0,0 +1,186 @@ +; +; jquanti.asm - sample data conversion and quantization (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11 = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 16 + global EXTN(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov rsi, r10 + mov eax, r11d ; start_col; the 32-bit write zero-extends into rax, which indexes the rows below + mov rdi, r12 + mov rcx, DCTSIZE/4 ; four sample rows are handled per loop pass +.convloop: + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 4*SIZEOF_JSAMPROW + add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec rcx + jnz short .convloop + + pop rbx +
uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) + +; r10 = JCOEFPTR coef_block +; r11 = DCTELEM *divisors +; r12 = DCTELEM *workspace + + align 16 + global EXTN(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/32 ; 32 coefficients are handled per loop pass +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] +
pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] + + pxor xmm0,xmm4 ; reapply the sign saved in xmm4..xmm7 (xor, then subtract the mask) + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 32*SIZEOF_DCTELEM + add rdx, byte 32*SIZEOF_DCTELEM + add rdi, byte 32*SIZEOF_JCOEF + dec rax + jnz near .quantloop + + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jquanti-sse2.asm b/Builder/jni-1.11/simd/jquanti-sse2.asm new file mode 100644 index 000000000..aea8604e2 --- /dev/null +++ b/Builder/jni-1.11/simd/jquanti-sse2.asm @@ -0,0 +1,199 @@ +; +; jquanti.asm - sample data conversion and quantization (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 ; four sample rows are handled per loop pass + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD
[XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM *divisors +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 ; 32 coefficients are handled per loop pass + alignx 16,7 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5
; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0,xmm4 ; reapply the sign saved in xmm4..xmm7 (xor, then subtract the mask) + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/Builder/jni-1.11/simd/jsimd.h b/Builder/jni-1.11/simd/jsimd.h new file mode 100644 index 000000000..dc6ec430d --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd.h @@ -0,0 +1,871 @@ +/* + * simd/jsimd.h + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2011, 2014-2016, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2014, Linaro Limited. + * Copyright (C) 2015-2016, Matthieu Darbois.
+ * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +/* Bitmask for supported acceleration methods */ + +#define JSIMD_NONE 0x00 +#define JSIMD_MMX 0x01 +#define JSIMD_3DNOW 0x02 +#define JSIMD_SSE 0x04 +#define JSIMD_SSE2 0x08 +#define JSIMD_ARM_NEON 0x10 +#define JSIMD_MIPS_DSPR2 0x20 +#define JSIMD_ALTIVEC 0x40 + +/* SIMD Ext: retrieve SIMD/CPU information */ +EXTERN(unsigned int) jpeg_simd_cpu_support (void); + +/* RGB & extended RGB --> YCC Colorspace Conversion */ +EXTERN(void) jsimd_rgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_ycc_convert_sse2[]; +EXTERN(void) jsimd_rgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE 
output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY 
input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); 
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* RGB & extended RGB --> Grayscale Colorspace Conversion */ +EXTERN(void) jsimd_rgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_gray_convert_sse2[]; +EXTERN(void) jsimd_rgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) 
jsimd_extrgbx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE 
output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* YCC --> RGB & extended RGB Colorspace Conversion */ +EXTERN(void) jsimd_ycc_rgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) 
jsimd_ycc_extxrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +extern const int jconst_ycc_rgb_convert_sse2[]; +EXTERN(void) jsimd_ycc_rgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + 
JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_altivec + (JDIMENSION 
out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +/* NULL Colorspace Conversion */ +EXTERN(void) jsimd_c_null_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows, int num_components); + +/* h2v1 Downsampling */ +EXTERN(void) jsimd_h2v1_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_mips_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION 
width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Downsampling */ +EXTERN(void) jsimd_h2v2_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_mips_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Smooth Downsampling */ +EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2 + (JSAMPARRAY input_data, JSAMPARRAY output_data, + JDIMENSION v_samp_factor, int max_v_samp_factor, + int smoothing_factor, JDIMENSION width_blocks, + JDIMENSION image_width); + + +/* Upsampling */ +EXTERN(void) jsimd_h2v1_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); 
+EXTERN(void) jsimd_h2v2_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_int_upsample_mips_dspr2 + (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr, JDIMENSION output_width, + int max_v_samp_factor); + +EXTERN(void) jsimd_h2v1_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Fancy Upsampling */ +EXTERN(void) jsimd_h2v1_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +extern const int jconst_fancy_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) 
jsimd_h2v2_fancy_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +/* Merged Upsampling */ +EXTERN(void) jsimd_h2v1_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE 
input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +extern const int jconst_merged_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY 
output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION 
in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); + +EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); + +EXTERN(void) jsimd_h2v1_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) 
jsimd_h2v1_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +/* Sample Conversion */ +EXTERN(void) jsimd_convsamp_mmx + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + 
+EXTERN(void) jsimd_convsamp_neon + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_mips_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_altivec + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +/* Floating Point Sample Conversion */ +EXTERN(void) jsimd_convsamp_float_3dnow + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_mips_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +/* Slow Integer Forward DCT */ +EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data); + +extern const int jconst_fdct_islow_sse2[]; +EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data); + +/* Fast Integer Forward DCT */ +EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data); + +extern const int jconst_fdct_ifast_sse2[]; +EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data); + +/* Floating Point Forward DCT */ +EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data); + +extern const int jconst_fdct_float_sse[]; +EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data); + +/* Quantization */ +EXTERN(void) jsimd_quantize_mmx + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_sse2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_neon + 
(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_mips_dspr2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_altivec + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +/* Floating Point Quantization */ +EXTERN(void) jsimd_quantize_float_3dnow + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_mips_dspr2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +/* Scaled Inverse DCT */ +EXTERN(void) jsimd_idct_2x2_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_red_sse2[]; +EXTERN(void) jsimd_idct_2x2_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col, int *workspace); +EXTERN(void) jsimd_idct_6x6_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) 
jsimd_idct_12x12_pass1_mips_dspr2 + (JCOEFPTR coef_block, void *dct_table, int *workspace); +EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2 + (int *workspace, int *output); + +/* Slow Integer Inverse DCT */ +EXTERN(void) jsimd_idct_islow_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_islow_sse2[]; +EXTERN(void) jsimd_idct_islow_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, int *output_buf, + JSAMPLE *output_col); + +EXTERN(void) jsimd_idct_islow_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Fast Integer Inverse DCT */ +EXTERN(void) jsimd_idct_ifast_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_ifast_sse2[]; +EXTERN(void) jsimd_idct_ifast_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 + (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr, + const int *idct_coefs); +EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2 + (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col, + const int *idct_coefs); + +EXTERN(void) jsimd_idct_ifast_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Floating Point Inverse DCT */ +EXTERN(void) jsimd_idct_float_3dnow + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse[]; +EXTERN(void) jsimd_idct_float_sse + 
(void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse2[]; +EXTERN(void) jsimd_idct_float_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Huffman coding */ +extern const int jconst_huff_encode_one_block[]; +EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2 + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); diff --git a/Builder/jni-1.11/simd/jsimd_altivec.h b/Builder/jni-1.11/simd/jsimd_altivec.h new file mode 100644 index 000000000..62dbc5cdf --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_altivec.h @@ -0,0 +1,99 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. 
This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" +#include <altivec.h> + + +/* Common code: constant-replication helpers and an 8x8 transpose of short elements */ + +#define __4X(a) a, a, a, a +#define __4X2(a, b) a, b, a, b, a, b, a, b +#define __8X(a) __4X(a), __4X(a) +#define __16X(a) __8X(a), __8X(a) + +#define TRANSPOSE(row, col) \ +{ \ + __vector short row04l, row04h, row15l, row15h, \ + row26l, row26h, row37l, row37h; \ + __vector short col01e, col01o, col23e, col23o, \ + col45e, col45o, col67e, col67o; \ + \ + /* transpose coefficients (phase 1) */ \ + row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ + row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ + row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ + row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ + row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ + row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ + row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ + row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ + \ + /* transpose coefficients (phase 2) */ \ + col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ + col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ + col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ + col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ + col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ + col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ + col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ + col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57
77) */ \ + \ + /* transpose coefficients (phase 3) */ \ + col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ + col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ + col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ + col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ + col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ + col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ + col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ + col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ +} + +#ifndef min +#define min(a,b) ((a) < (b) ? (a) : (b)) +#endif + + +/* Macros to abstract big/little endian bit twiddling */ + +#if __BIG_ENDIAN__ + +#define VEC_LD(a, b) vec_ld(a, b) +#define VEC_ST(a, b, c) vec_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) +#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a) + +#else + +#define VEC_LD(a, b) vec_vsx_ld(a, b) +#define VEC_ST(a, b, c) vec_vsx_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) +#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero) + +#endif diff --git a/Builder/jni-1.11/simd/src/jsimd_arm.c b/Builder/jni-1.11/simd/jsimd_arm.c similarity index 59% rename from Builder/jni-1.11/simd/src/jsimd_arm.c rename to Builder/jni-1.11/simd/jsimd_arm.c index 03b4b07fe..4b20c77ba 100644 --- a/Builder/jni-1.11/simd/src/jsimd_arm.c +++ b/Builder/jni-1.11/simd/jsimd_arm.c @@ -2,34 +2,95 @@ * jsimd_arm.c * * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright 2009-2011 D. R. Commander + * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015-2016, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru.
* For conditions of distribution and use, see copyright notice in jsimdext.inc * * This file contains the interface between the "normal" portions - * of the library and the SIMD implementations when running on - * ARM architecture. - * - * Based on the stubs from 'jsimd_none.c' + * of the library and the SIMD implementations when running on a + * 32-bit ARM architecture. */ #define JPEG_INTERNALS -#include "jinclude.h" -#include "jpeglib.h" -#include "jdct.h" -#include "jsimddct.h" +#include "h/jinclude.h" +#include "h/jpeglib.h" +#include "h/jsimd.h" +#include "h/jdct.h" +#include "h/jsimddct.h" #include "jsimd.h" #include #include #include -#include "StLog.h" -#define LCTX "TurboJPEG.SIMD" - static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature (char *buffer, char *feature) +{ + char *p; + if (*feature == 0) + return 0; + if (strncmp(buffer, "Features", 8) != 0) + return 0; + buffer += 8; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! 
*/ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "neon")) + simd_support |= JSIMD_ARM_NEON; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif /* * Check what SIMD accelerations are supported. @@ -39,28 +100,39 @@ static unsigned int simd_support = ~0; LOCAL(void) init_simd (void) { + char *env = NULL; +#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + if (simd_support != ~0U) - { return; - } simd_support = 0; - char *env = NULL; - /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCE_ARM_NEON"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - { - simd_support |= JSIMD_ARM_NEON; +#if defined(__ARM_NEON__) + simd_support |= JSIMD_ARM_NEON; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + /* We still have a chance to use NEON regardless of globally used + * -mcpu/-mfpu options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux/android */ + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; } +#endif - env = getenv("JSIMD_FORCE_NO_SIMD"); + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ARM_NEON; + env = getenv("JSIMD_FORCENONE"); if ((env != NULL) && (strcmp(env, "1") == 0)) - { simd_support = 0; - } - - INFO_L(LCTX, "SIMD support: %d", simd_support); + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; } GLOBAL(int) @@ -102,6 +174,24 @@ jsimd_can_ycc_rgb (void) return 0; if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + init_simd(); + + /* The code 
is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (simd_support & JSIMD_ARM_NEON) return 1; @@ -115,8 +205,7 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, { void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - switch(cinfo->in_color_space) - { + switch(cinfo->in_color_space) { case JCS_EXT_RGB: neonfct=jsimd_extrgb_ycc_convert_neon; break; @@ -144,9 +233,7 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, break; } - if (simd_support & JSIMD_ARM_NEON) - neonfct(cinfo->image_width, input_buf, - output_buf, output_row, num_rows); + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); } GLOBAL(void) @@ -163,8 +250,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, { void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); - switch(cinfo->out_color_space) - { + switch(cinfo->out_color_space) { case JCS_EXT_RGB: neonfct=jsimd_ycc_extrgb_convert_neon; break; @@ -187,14 +273,21 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, case JCS_EXT_ARGB: neonfct=jsimd_ycc_extxrgb_convert_neon; break; - default: + default: neonfct=jsimd_ycc_extrgb_convert_neon; break; } - if (simd_support & JSIMD_ARM_NEON) - neonfct(cinfo->output_width, input_buf, - input_row, output_buf, num_rows); + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); } GLOBAL(int) @@ -214,13 +307,13 @@ jsimd_can_h2v1_downsample (void) } GLOBAL(void) -jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { } GLOBAL(void) -jsimd_h2v1_downsample (j_compress_ptr cinfo, 
jpeg_component_info * compptr, +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { } @@ -243,17 +336,17 @@ jsimd_can_h2v1_upsample (void) GLOBAL(void) jsimd_h2v2_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { } GLOBAL(void) jsimd_h2v1_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { } @@ -284,21 +377,21 @@ jsimd_can_h2v1_fancy_upsample (void) GLOBAL(void) jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { } GLOBAL(void) jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, output_data_ptr); + jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(int) @@ -364,15 +457,14 @@ jsimd_can_convsamp_float (void) GLOBAL(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM * workspace) + DCTELEM *workspace) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_convsamp_neon(sample_data, start_col, workspace); + jsimd_convsamp_neon(sample_data, start_col, workspace); } GLOBAL(void) jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, - FAST_FLOAT * workspace) + FAST_FLOAT *workspace) { } @@ -410,19 +502,18 @@ jsimd_can_fdct_float (void) } GLOBAL(void) -jsimd_fdct_islow (DCTELEM * data) 
+jsimd_fdct_islow (DCTELEM *data) { } GLOBAL(void) -jsimd_fdct_ifast (DCTELEM * data) +jsimd_fdct_ifast (DCTELEM *data) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_fdct_ifast_neon(data); + jsimd_fdct_ifast_neon(data); } GLOBAL(void) -jsimd_fdct_float (FAST_FLOAT * data) +jsimd_fdct_float (FAST_FLOAT *data) { } @@ -454,16 +545,15 @@ jsimd_can_quantize_float (void) } GLOBAL(void) -jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, - DCTELEM * workspace) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_quantize_neon(coef_block, divisors, workspace); + jsimd_quantize_neon(coef_block, divisors, workspace); } GLOBAL(void) -jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, - FAST_FLOAT * workspace) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) { } @@ -484,7 +574,7 @@ jsimd_can_idct_2x2 (void) if (sizeof(ISLOW_MULT_TYPE) != 2) return 0; - if ((simd_support & JSIMD_ARM_NEON)) + if (simd_support & JSIMD_ARM_NEON) return 1; return 0; @@ -507,28 +597,28 @@ jsimd_can_idct_4x4 (void) if (sizeof(ISLOW_MULT_TYPE) != 2) return 0; - if ((simd_support & JSIMD_ARM_NEON)) + if (simd_support & JSIMD_ARM_NEON) return 1; return 0; } GLOBAL(void) -jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { - if ((simd_support & JSIMD_ARM_NEON)) - jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) -jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { - if ((simd_support & JSIMD_ARM_NEON)) - 
jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(int) @@ -573,7 +663,7 @@ jsimd_can_idct_ifast (void) if (IFAST_SCALE_BITS != 2) return 0; - if ((simd_support & JSIMD_ARM_NEON)) + if (simd_support & JSIMD_ARM_NEON) return 1; return 0; @@ -588,27 +678,51 @@ jsimd_can_idct_float (void) } GLOBAL(void) -jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) { - if ((simd_support & JSIMD_ARM_NEON)) - jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) -jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) { - if ((simd_support & JSIMD_ARM_NEON)) - jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) -jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) { } +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) 
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); +} diff --git a/Builder/jni-1.11/simd/jsimd_arm64.c b/Builder/jni-1.11/simd/jsimd_arm64.c new file mode 100644 index 000000000..d1139ef96 --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_arm64.c @@ -0,0 +1,803 @@ +/* + * jsimd_arm64.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015-2016, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit ARM architecture. 
+ */ + +#define JPEG_INTERNALS +#include "h/jinclude.h" +#include "h/jpeglib.h" +#include "h/jsimd.h" +#include "h/jdct.h" +#include "h/jsimddct.h" +#include "jsimd.h" + +#include +#include +#include + +#define JSIMD_FASTLD3 1 +#define JSIMD_FASTST3 2 +#define JSIMD_FASTTBL 4 + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; +static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | + JSIMD_FASTTBL; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_cpuinfo (char *buffer, const char *field, char *value) +{ + char *p; + if (*value == 0) + return 0; + if (strncmp(buffer, field, strlen(field)) != 0) + return 0; + buffer += strlen(field); + while (isspace(*buffer)) + buffer++; + + /* Check if 'value' is present in the buffer as a separate word */ + while ((p = strstr(buffer, value))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(value); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_cpuinfo(buffer, "CPU part", "0xd03") || + check_cpuinfo(buffer, "CPU part", "0xd07")) + /* The Cortex-A53 has a slow tbl implementation. We can gain a few + percent speedup by disabling the use of that instruction. The + speedup on Cortex-A57 is more subtle but still measurable. */ + simd_features &= ~JSIMD_FASTTBL; + else if (check_cpuinfo(buffer, "CPU part", "0x0a1")) + /* The SIMD version of Huffman encoding is slower than the C version on + Cavium ThunderX. 
Also, ld3 and st3 are abyssmally slow on that + CPU. */ + simd_huffman = simd_features = 0; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ + +/* + * ARMv8 architectures support NEON extensions by default. + * It is no longer optional as it was with ARMv7. + */ + + +LOCAL(void) +init_simd (void) +{ + char *env = NULL; +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + + simd_support |= JSIMD_ARM_NEON; +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ARM_NEON; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; + env = getenv("JSIMD_FASTLD3"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_features |= JSIMD_FASTLD3; + if ((env != NULL) && (strcmp(env, "0") == 0)) + simd_features &= ~JSIMD_FASTLD3; + env = getenv("JSIMD_FASTST3"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_features |= JSIMD_FASTST3; + if ((env != NULL) && (strcmp(env, "0") == 0)) + simd_features &= ~JSIMD_FASTST3; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) 
+ return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + if (simd_features & JSIMD_FASTLD3) + neonfct=jsimd_extrgb_ycc_convert_neon; + else + neonfct=jsimd_extrgb_ycc_convert_neon_slowld3; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + if (simd_features & JSIMD_FASTLD3) + neonfct=jsimd_extbgr_ycc_convert_neon; + else + neonfct=jsimd_extbgr_ycc_convert_neon_slowld3; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_extxrgb_ycc_convert_neon; + break; + default: + if (simd_features & JSIMD_FASTLD3) + neonfct=jsimd_extrgb_ycc_convert_neon; + else + neonfct=jsimd_extrgb_ycc_convert_neon_slowld3; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE 
output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + if (simd_features & JSIMD_FASTST3) + neonfct=jsimd_ycc_extrgb_convert_neon; + else + neonfct=jsimd_ycc_extrgb_convert_neon_slowst3; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + if (simd_features & JSIMD_FASTST3) + neonfct=jsimd_ycc_extbgr_convert_neon; + else + neonfct=jsimd_ycc_extbgr_convert_neon_slowst3; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_ycc_extxrgb_convert_neon; + break; + default: + if (simd_features & JSIMD_FASTST3) + neonfct=jsimd_ycc_extrgb_convert_neon; + else + neonfct=jsimd_ycc_extrgb_convert_neon_slowst3; + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + 
return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION 
in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + jsimd_fdct_islow_neon(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + 
return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + 
return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + if (simd_features & JSIMD_FASTTBL) + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); + else + return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, + last_dc_val, dctbl, actbl); +} diff --git a/Builder/jni-1.11/simd/jsimd_arm64_neon.S b/Builder/jni-1.11/simd/jsimd_arm64_neon.S new file mode 100644 index 000000000..330985824 --- /dev/null +++ 
b/Builder/jni-1.11/simd/jsimd_arm64_neon.S @@ -0,0 +1,3425 @@ +/* + * ARMv8 NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka + * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. + * Author: Ragesh Radhakrishnan + * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +.text + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose elements of single 128 bit registers */ +.macro transpose_single x0, x1, xi, xilen, literal + ins \xi\xilen[0], \x0\xilen[0] + ins \x1\xilen[0], \x0\xilen[1] + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + +/* Transpose elements of 2 differnet registers */ +.macro transpose x0, x1, xi, xilen, literal + mov \xi\xilen, \x0\xilen + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x2\x2len + trn2 \x2\x2len, \xi\x0len, \x2\x2len + mov \xi\xilen, \x1\xilen + trn1 \x1\x1len, \x1\x1len, \x3\x3len + trn2 \x3\x3len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x1\x1len + trn2 \x1\x2len, \xi\x0len, \x1\x2len + mov \xi\xilen, \x2\xilen + trn1 \x2\x2len, \x2\x2len, \x3\x3len + trn2 \x3\x2len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4 x0, x1, x2, x3, x5 + transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b + transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b +.endm + +.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 + trn1 \t0\().8h, \l0\().8h, \l1\().8h + trn1 \t1\().8h, 
\l2\().8h, \l3\().8h + trn1 \t2\().8h, \l4\().8h, \l5\().8h + trn1 \t3\().8h, \l6\().8h, \l7\().8h + trn2 \l1\().8h, \l0\().8h, \l1\().8h + trn2 \l3\().8h, \l2\().8h, \l3\().8h + trn2 \l5\().8h, \l4\().8h, \l5\().8h + trn2 \l7\().8h, \l6\().8h, \l7\().8h + + trn1 \l4\().4s, \t2\().4s, \t3\().4s + trn2 \t3\().4s, \t2\().4s, \t3\().4s + trn1 \t2\().4s, \t0\().4s, \t1\().4s + trn2 \l2\().4s, \t0\().4s, \t1\().4s + trn1 \t0\().4s, \l1\().4s, \l3\().4s + trn2 \l3\().4s, \l1\().4s, \l3\().4s + trn2 \t1\().4s, \l5\().4s, \l7\().4s + trn1 \l5\().4s, \l5\().4s, \l7\().4s + + trn2 \l6\().2d, \l2\().2d, \t3\().2d + trn1 \l0\().2d, \t2\().2d, \l4\().2d + trn1 \l1\().2d, \t0\().2d, \l5\().2d + trn2 \l7\().2d, \l3\().2d, \t1\().2d + trn1 \l2\().2d, \l2\().2d, \t3\().2d + trn2 \l4\().2d, \t2\().2d, \l4\().2d + trn1 \l3\().2d, \l3\().2d, \t1\().2d + trn2 \l5\().2d, \t0\().2d, \l5\().2d +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. 
+ * + * GLOBAL(void) + * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_idct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 + +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_idct_islow_neon + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. 
This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + sub sp, sp, #64 + adr x15, Ljsimd_idct_islow_neon_consts + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 + ld1 {v0.8h, v1.8h}, [x15] + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 + ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 + ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 + + cmeq v16.8h, v3.8h, #0 + cmeq v26.8h, v4.8h, #0 + cmeq v27.8h, v5.8h, #0 + cmeq v28.8h, v6.8h, #0 + cmeq v29.8h, v7.8h, #0 + cmeq v30.8h, v8.8h, #0 + cmeq v31.8h, v9.8h, #0 + + and v10.16b, v16.16b, v26.16b + and v11.16b, v27.16b, v28.16b + and v12.16b, v29.16b, v30.16b + and v13.16b, v31.16b, v10.16b + and v14.16b, v11.16b, v12.16b + mul v2.8h, v2.8h, v18.8h + and v15.16b, v13.16b, v14.16b + shl v10.8h, v2.8h, #(PASS1_BITS) + sqxtn v16.8b, v15.8h + mov TMP1, v16.d[0] + mvn TMP2, TMP1 + + cbnz TMP2, 2f + /* case all AC coeffs are zeros */ + dup v2.2d, v10.d[0] + dup v6.2d, v10.d[1] + mov v3.16b, v2.16b + mov v7.16b, v6.16b + mov v4.16b, v2.16b + mov v8.16b, v6.16b + mov v5.16b, v2.16b + mov v9.16b, v6.16b +1: + /* for this transpose, we should organise data like this: + * 00, 01, 02, 03, 40, 41, 42, 43 + * 10, 11, 12, 13, 50, 51, 52, 53 + * 20, 21, 22, 23, 60, 61, 62, 63 + * 30, 31, 32, 33, 70, 71, 72, 73 + * 04, 05, 06, 07, 44, 45, 46, 47 + * 14, 15, 16, 17, 54, 55, 56, 57 + * 24, 25, 26, 27, 64, 65, 66, 67 + * 34, 35, 36, 37, 74, 75, 76, 77 + */ + trn1 v28.8h, v2.8h, v3.8h + trn1 v29.8h, v4.8h, v5.8h + trn1 v30.8h, v6.8h, v7.8h + trn1 v31.8h, v8.8h, v9.8h + trn2 v16.8h, v2.8h, v3.8h + trn2 v17.8h, v4.8h, v5.8h + trn2 v18.8h, v6.8h, v7.8h + trn2 v19.8h, v8.8h, v9.8h + trn1 v2.4s, v28.4s, v29.4s + trn1 v6.4s, v30.4s, v31.4s + trn1 v3.4s, v16.4s, v17.4s + trn1 v7.4s, v18.4s, v19.4s + trn2 v4.4s, v28.4s, v29.4s + trn2 v8.4s, v30.4s, v31.4s + trn2 v5.4s, v16.4s, v17.4s + 
trn2 v9.4s, v18.4s, v19.4s + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 
- tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, 
XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub 
v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + movi v0.16b, #(CENTERJSAMPLE) + /* Prepare pointers (dual-issue with NEON instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16) + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16) + add 
TMP1, TMP1, OUTPUT_COL + sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP2, TMP2, OUTPUT_COL + sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP3, TMP3, OUTPUT_COL + sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP4, TMP4, OUTPUT_COL + sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16) + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16) + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 /* reload the low 64 bits of v8-v11 and pop 32 bytes */ + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 /* reload the low 64 bits of v12-v15 and pop 32 bytes */ + ret /* return through x30; RET leaves x30 intact, whereas BLR x30 overwrites it and is a call-type branch */ + +.balign 16 +2: + mul v3.8h, v3.8h, v19.8h + mul v4.8h, v4.8h, v20.8h + mul v5.8h, v5.8h, v21.8h + add TMP4, xzr, TMP2, LSL #32 /* TMP4 = TMP2 << 32, i.e. only the low word of TMP2 survives */ + mul v6.8h, v6.8h, v22.8h + mul v7.8h, v7.8h, v23.8h + adds TMP3, xzr, TMP2, LSR #32 /* TMP3 = TMP2 >> 32; Z flag is set when the high word of TMP2 is zero */ + mul v8.8h, v8.8h, v24.8h + mul v9.8h, v9.8h, v25.8h + b.ne 3f + /* Right AC coef is zero */ + dup v15.2d, v10.d[1] + /* Even part: reverse the even part of the forward DCT.
*/ + add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + + add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + sub v20.4s, 
v2.4s, v16.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + mov v6.16b, v15.16b + mov v7.16b, v15.16b + mov v8.16b, v15.16b + mov v9.16b, v15.16b + b 1b + +.balign 16 +3: + cbnz TMP4, 4f + /* Left AC coef is zero */ + dup v14.2d, v10.d[0] + /* Even part: reverse the even part of the forward DCT. 
*/ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + + /* Final output stage: inputs 
are tmp10..tmp13, tmp0..tmp3 */ + + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + mov v2.16b, v14.16b + mov v3.16b, v14.16b + mov v4.16b, v14.16b + mov v5.16b, v14.16b + rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + +.balign 16 +4: + /* "No" AC coef is zero */ + /* Even part: reverse the even part of the forward DCT. 
*/ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * 
transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull v24.4s, v24.4h, 
XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, 
#(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + + .unreq DCT_TABLE + 
.unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + +#undef CENTERJSAMPLE +#undef CONST_BITS +#undef PASS1_BITS +#undef XFIX_P_0_298 +#undef XFIX_N_0_390 +#undef XFIX_P_0_541 +#undef XFIX_P_0_765 +#undef XFIX_N_0_899 +#undef XFIX_P_1_175 +#undef XFIX_P_1_501 +#undef XFIX_N_1_847 +#undef XFIX_N_1_961 +#undef XFIX_P_2_053 +#undef XFIX_N_2_562 +#undef XFIX_P_3_072 + + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in ARM NEON case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. 
+ */ + +#define XFIX_1_082392200 v0.h[0] +#define XFIX_1_414213562 v0.h[1] +#define XFIX_1_847759065 v0.h[2] +#define XFIX_2_613125930 v0.h[3] + +.balign 16 +Ljsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: only the part above 1.0 is stored */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: only the part above 1.0 is stored */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: only the part above 1.0 is stored */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: only the part above 2.0 is stored */ + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 /* same register as DCT_TABLE */ + TMP2 .req x1 /* same register as COEF_BLOCK */ + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( v16.8h ) + * 1 | d18 | d19 ( v17.8h ) + * 2 | d20 | d21 ( v18.8h ) + * 3 | d22 | d23 ( v19.8h ) + * 4 | d24 | d25 ( v20.8h ) + * 5 | d26 | d27 ( v21.8h ) + * 6 | d28 | d29 ( v22.8h ) + * 7 | d30 | d31 ( v23.8h ) + */ + /* Get the constant table address; coefficient and quantization table loads are interleaved with the dequantizing multiplies */ + adr TMP5, Ljsimd_idct_ifast_neon_consts + ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 + ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32 + mul v16.8h, v16.8h, v0.8h + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 + mul v17.8h, v17.8h, v1.8h + ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32 + mul v18.8h, v18.8h, v2.8h + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 + mul v19.8h, v19.8h, v3.8h + ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32 + mul v20.8h, v20.8h, v0.8h + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 + mul v22.8h, v22.8h, v2.8h + mul v21.8h, v21.8h, v1.8h + ld1 {v0.4h}, [TMP5] /* load constants; v0 is free again once row 4 has been dequantized */ + mul v23.8h, v23.8h, v3.8h + + /* 1-D IDCT, pass 1 */ + sub v2.8h, v18.8h, v22.8h + add v22.8h, 
v18.8h, v22.8h + sub v1.8h, v19.8h, v21.8h + add v21.8h, v19.8h, v21.8h + sub v5.8h, v17.8h, v23.8h + add v23.8h, v17.8h, v23.8h + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 + add v3.8h, v1.8h, v1.8h + sub v1.8h, v5.8h, v1.8h + add v18.8h, v2.8h, v4.8h + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 + sub v2.8h, v23.8h, v21.8h + add v3.8h, v3.8h, v6.8h + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 + add v1.8h, v1.8h, v4.8h + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 + sub v18.8h, v18.8h, v22.8h + add v2.8h, v2.8h, v6.8h + sub v6.8h, v16.8h, v20.8h + add v20.8h, v16.8h, v20.8h + add v17.8h, v5.8h, v4.8h + add v5.8h, v6.8h, v18.8h + sub v18.8h, v6.8h, v18.8h + add v6.8h, v23.8h, v21.8h + add v16.8h, v20.8h, v22.8h + sub v3.8h, v6.8h, v3.8h + sub v20.8h, v20.8h, v22.8h + sub v3.8h, v3.8h, v1.8h + sub v1.8h, v17.8h, v1.8h + add v2.8h, v3.8h, v2.8h + sub v23.8h, v16.8h, v6.8h + add v1.8h, v1.8h, v2.8h + add v16.8h, v16.8h, v6.8h + add v22.8h, v5.8h, v3.8h + sub v17.8h, v5.8h, v3.8h + sub v21.8h, v18.8h, v2.8h + add v18.8h, v18.8h, v2.8h + sub v19.8h, v20.8h, v1.8h + add v20.8h, v20.8h, v1.8h + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31 + /* 1-D IDCT, pass 2 */ + sub v2.8h, v18.8h, v22.8h + add v22.8h, v18.8h, v22.8h + sub v1.8h, v19.8h, v21.8h + add v21.8h, v19.8h, v21.8h + sub v5.8h, v17.8h, v23.8h + add v23.8h, v17.8h, v23.8h + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 + add v3.8h, v1.8h, v1.8h + sub v1.8h, v5.8h, v1.8h + add v18.8h, v2.8h, v4.8h + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 + sub v2.8h, v23.8h, v21.8h + add v3.8h, v3.8h, v6.8h + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 + add v1.8h, v1.8h, v4.8h + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 + sub v18.8h, v18.8h, v22.8h + add v2.8h, v2.8h, v6.8h + sub v6.8h, v16.8h, v20.8h + add v20.8h, v16.8h, v20.8h + add v17.8h, v5.8h, v4.8h + add v5.8h, v6.8h, v18.8h + sub v18.8h, v6.8h, v18.8h + add v6.8h, v23.8h, v21.8h + add v16.8h, 
v20.8h, v22.8h + sub v3.8h, v6.8h, v3.8h + sub v20.8h, v20.8h, v22.8h + sub v3.8h, v3.8h, v1.8h + sub v1.8h, v17.8h, v1.8h + add v2.8h, v3.8h, v2.8h + sub v23.8h, v16.8h, v6.8h + add v1.8h, v1.8h, v2.8h + add v16.8h, v16.8h, v6.8h + add v22.8h, v5.8h, v3.8h + sub v17.8h, v5.8h, v3.8h + sub v21.8h, v18.8h, v2.8h + add v18.8h, v18.8h, v2.8h + sub v19.8h, v20.8h, v1.8h + add v20.8h, v20.8h, v1.8h + /* Descale to 8-bit and range limit */ + movi v0.16b, #0x80 /* 0x80 in every byte; added after the saturating narrow below */ + /* Prepare pointers (dual-issue with NEON instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqshrn v28.8b, v16.8h, #5 + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqshrn v29.8b, v17.8h, #5 + add TMP1, TMP1, OUTPUT_COL + sqshrn v30.8b, v18.8h, #5 + add TMP2, TMP2, OUTPUT_COL + sqshrn v31.8b, v19.8h, #5 + add TMP3, TMP3, OUTPUT_COL + sqshrn2 v28.16b, v20.8h, #5 + add TMP4, TMP4, OUTPUT_COL + sqshrn2 v29.16b, v21.8h, #5 + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqshrn2 v30.16b, v22.8h, #5 + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqshrn2 v31.16b, v23.8h, #5 + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer (one 8-byte row per output pointer) */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + br x30 /* plain return: BLR would also overwrite x30 with a link address */ + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq 
OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +Ljsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* v0.h[0] */ + .short -FIX_0_765366865 /* v0.h[1] */ + .short -FIX_0_211164243 /* v0.h[2] */ + .short FIX_1_451774981 /* v0.h[3] */ + .short -FIX_2_172734803 /* d1[0] */ 
+ .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* v2.h[0] */ + .short FIX_2_562915447 /* v2.h[1] */ + .short 1 << (CONST_BITS+1) /* v2.h[2] */ + .short 0 /* v2.h[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + smull v28.4s, \x4, v2.h[2] + smlal v28.4s, \x8, v0.h[0] + smlal v28.4s, \x14, v0.h[1] + + smull v26.4s, \x16, v1.h[2] + smlal v26.4s, \x12, v1.h[3] + smlal v26.4s, \x10, v2.h[0] + smlal v26.4s, \x6, v2.h[1] + + smull v30.4s, \x4, v2.h[2] + smlsl v30.4s, \x8, v0.h[0] + smlsl v30.4s, \x14, v0.h[1] + + smull v24.4s, \x16, v0.h[2] + smlal v24.4s, \x12, v0.h[3] + smlal v24.4s, \x10, v1.h[0] + smlal v24.4s, \x6, v1.h[1] + + add v20.4s, v28.4s, v26.4s + sub v28.4s, v28.4s, v26.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v28.4s, v28.4s, #\shift + xtn \y26, v20.4s + xtn \y29, v28.4s + .else + rshrn \y26, v20.4s, #\shift + rshrn \y29, v28.4s, #\shift + .endif + + add v20.4s, v30.4s, v24.4s + sub v30.4s, v30.4s, v24.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v30.4s, v30.4s, #\shift + xtn \y27, v20.4s + xtn \y28, v30.4s + .else + rshrn \y27, v20.4s, #\shift + rshrn \y28, v30.4s, #\shift + .endif +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x2 + TMP4 .req x15 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. 
*/ + uxtw x3, w3 + + /* Save all used NEON registers */ + sub sp, sp, 64 + mov x9, sp + /* Load constants (v3.4h is just used for padding) */ + adr TMP4, Ljsimd_idct_4x4_neon_consts + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | v4.4h | v5.4h + * 1 | v6.4h | v7.4h + * 2 | v8.4h | v9.4h + * 3 | v10.4h | v11.4h + * 4 | - | - + * 5 | v12.4h | v13.4h + * 6 | v14.4h | v15.4h + * 7 | v16.4h | v17.4h + */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 + /* dequantize */ + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 + mul v4.4h, v4.4h, v18.4h + mul v5.4h, v5.4h, v19.4h + ins v4.d[1], v5.d[0] /* 128 bit q4 */ + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 + mul v6.4h, v6.4h, v20.4h + mul v7.4h, v7.4h, v21.4h + ins v6.d[1], v7.d[0] /* 128 bit q6 */ + mul v8.4h, v8.4h, v22.4h + mul v9.4h, v9.4h, v23.4h + ins v8.d[1], v9.d[0] /* 128 bit q8 */ + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 + mul v10.4h, v10.4h, v24.4h + mul v11.4h, v11.4h, v25.4h + ins v10.d[1], v11.d[0] /* 128 bit q10 */ + mul v12.4h, v12.4h, v26.4h + mul v13.4h, v13.4h, v27.4h + ins v12.d[1], v13.d[0] /* 128 bit q12 */ + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 + mul v14.4h, v14.4h, v28.4h + mul v15.4h, v15.4h, v29.4h + ins v14.d[1], v15.d[0] /* 128 bit q14 */ + mul v16.4h, v16.4h, v30.4h + mul v17.4h, v17.4h, v31.4h + ins v16.d[1], v17.d[0] /* 128 bit q16 */ + + /* Pass 1 */ + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \ + v4.4h, v6.4h, v8.4h, v10.4h + transpose_4x4 v4, v6, v8, v10, v3 + ins v10.d[1], v11.d[0] + 
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \ + v5.4h, v7.4h, v9.4h, v11.4h + transpose_4x4 v5, v7, v9, v11, v3 + ins v10.d[1], v11.d[0] + + /* Pass 2 */ + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \ + v26.4h, v27.4h, v28.4h, v29.4h + transpose_4x4 v26, v27, v28, v29, v3 + + /* Range limit */ + movi v30.8h, #0x80 + ins v26.d[1], v27.d[0] + ins v28.d[1], v29.d[0] + add v26.8h, v26.8h, v30.8h + add v28.8h, v28.8h, v30.8h + sqxtun v26.8b, v26.8h + sqxtun v27.8b, v28.8h + + /* Store results to the output buffer */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + ldp TMP3, TMP4, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use far fewer instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses. NOTE(review): __ARMEL__ is the 32-bit ARM macro; AArch64 compilers presumably define __AARCH64EL__ instead, so this branch may never be assembled - confirm. + */ + st1 {v26.s}[0], [TMP1], 4 + st1 {v27.s}[0], [TMP3], 4 + st1 {v26.s}[1], [TMP2], 4 + st1 {v27.s}[1], [TMP4], 4 +#else + st1 {v26.b}[0], [TMP1], 1 + st1 {v27.b}[0], [TMP3], 1 + st1 {v26.b}[1], [TMP1], 1 + st1 {v27.b}[1], [TMP3], 1 + st1 {v26.b}[2], [TMP1], 1 + st1 {v27.b}[2], [TMP3], 1 + st1 {v26.b}[3], [TMP1], 1 + st1 {v27.b}[3], [TMP3], 1 + + st1 {v26.b}[4], [TMP2], 1 + st1 {v27.b}[4], [TMP4], 1 + st1 {v26.b}[5], [TMP2], 1 + st1 {v27.b}[5], [TMP4], 1 + st1 {v26.b}[6], [TMP2], 1 + st1 {v27.b}[6], [TMP4], 1 + st1 {v26.b}[7], [TMP2], 1 + st1 {v27.b}[7], [TMP4], 1 +#endif + + /* vpop {v8.4h - v15.4h} ;not available */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 /* plain return: BLR would also overwrite x30 with a link address */ + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code 
for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +Ljsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* v14[0] */ + .short FIX_0_850430095 /* v14[1] */ + .short -FIX_1_272758580 /* v14[2] */ + .short FIX_3_624509785 /* v14[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + sshll v15.4s, \x4, #15 + smull v26.4s, \x6, v14.h[3] + smlal v26.4s, \x10, v14.h[2] + smlal v26.4s, \x12, v14.h[1] + smlal v26.4s, \x16, v14.h[0] + + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v15.4s, v15.4s, #\shift + xtn \y26, v20.4s + xtn \y27, v15.4s + .else + rshrn \y26, v20.4s, #\shift + rshrn \y27, v15.4s, #\shift + .endif +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x15 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. 
*/ + uxtw x3, w3 + + /* vpush {v8.4h - v15.4h} ; not available */ + sub sp, sp, 64 + mov x9, sp + + /* Load constants */ + adr TMP2, Ljsimd_idct_2x2_neon_consts + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v14.4h}, [TMP2] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | v4.4h | v5.4h + * 1 | v6.4h | v7.4h + * 2 | - | - + * 3 | v10.4h | v11.4h + * 4 | - | - + * 5 | v12.4h | v13.4h + * 6 | - | - + * 7 | v16.4h | v17.4h + */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 + /* Dequantize */ + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 + mul v4.4h, v4.4h, v18.4h + mul v5.4h, v5.4h, v19.4h + ins v4.d[1], v5.d[0] + mul v6.4h, v6.4h, v20.4h + mul v7.4h, v7.4h, v21.4h + ins v6.d[1], v7.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 + mul v10.4h, v10.4h, v24.4h + mul v11.4h, v11.4h, v25.4h + ins v10.d[1], v11.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 + mul v12.4h, v12.4h, v26.4h + mul v13.4h, v13.4h, v27.4h + ins v12.d[1], v13.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 + mul v16.4h, v16.4h, v30.4h + mul v17.4h, v17.4h, v31.4h + ins v16.d[1], v17.d[0] + + /* Pass 1 */ +#if 0 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h +#else + smull v26.4s, v6.4h, v14.h[3] + smlal v26.4s, v10.4h, v14.h[2] + smlal v26.4s, v12.4h, v14.h[1] + smlal v26.4s, v16.4h, v14.h[0] + smull v24.4s, v7.4h, v14.h[3] + smlal v24.4s, v11.4h, v14.h[2] + smlal 
v24.4s, v13.4h, v14.h[1] + smlal v24.4s, v17.4h, v14.h[0] + sshll v15.4s, v4.4h, #15 + sshll v30.4s, v5.4h, #15 + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s + rshrn v4.4h, v20.4s, #13 + rshrn v6.4h, v15.4s, #13 + add v20.4s, v30.4s, v24.4s + sub v15.4s, v30.4s, v24.4s + rshrn v5.4h, v20.4s, #13 + rshrn v7.4h, v15.4s, #13 + ins v4.d[1], v5.d[0] + ins v6.d[1], v7.d[0] + transpose v4, v6, v3, .16b, .8h + transpose v6, v10, v3, .16b, .4s + ins v11.d[0], v10.d[1] + ins v7.d[0], v6.d[1] +#endif + + /* Pass 2 */ + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h + + /* Range limit */ + movi v30.8h, #0x80 + ins v26.d[1], v27.d[0] + add v26.8h, v26.8h, v30.8h + sqxtun v30.8b, v26.8h + ins v26.d[0], v30.d[0] + sqxtun v27.8b, v26.8h + + /* Store results to the output buffer (two bytes through each of the two row pointers) */ + ldp TMP1, TMP2, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + st1 {v26.b}[0], [TMP1], 1 + st1 {v27.b}[4], [TMP1], 1 + st1 {v26.b}[1], [TMP2], 1 + st1 {v27.b}[5], [TMP2], 1 + + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 /* reload the 64 bytes saved at entry; post-increment puts sp back */ + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 /* plain return: BLR would also overwrite x30 with a link address */ + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro do_load size + .if \size == 8 + ld1 {v4.8b}, [U], 8 + ld1 {v5.8b}, [V], 8 + ld1 {v0.8b}, [Y], 8 + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + .elseif \size == 4 + ld1 {v4.b}[0], [U], 1 + ld1 {v4.b}[1], [U], 1 + ld1 {v4.b}[2], [U], 1 + ld1 {v4.b}[3], [U], 1 + ld1 {v5.b}[0], [V], 1 + ld1 {v5.b}[1], [V], 1 + ld1 {v5.b}[2], [V], 1 + ld1 {v5.b}[3], [V], 1 
+ ld1 {v0.b}[0], [Y], 1 + ld1 {v0.b}[1], [Y], 1 + ld1 {v0.b}[2], [Y], 1 + ld1 {v0.b}[3], [Y], 1 + .elseif \size == 2 + ld1 {v4.b}[4], [U], 1 + ld1 {v4.b}[5], [U], 1 + ld1 {v5.b}[4], [V], 1 + ld1 {v5.b}[5], [V], 1 + ld1 {v0.b}[4], [Y], 1 + ld1 {v0.b}[5], [Y], 1 + .elseif \size == 1 + ld1 {v4.b}[6], [U], 1 + ld1 {v5.b}[6], [V], 1 + ld1 {v0.b}[6], [Y], 1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size, fast_st3 + .if \bpp == 24 + .if \size == 8 + .if \fast_st3 == 1 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 + .else + st1 {v10.b}[0], [RGB], #1 + st1 {v11.b}[0], [RGB], #1 + st1 {v12.b}[0], [RGB], #1 + + st1 {v10.b}[1], [RGB], #1 + st1 {v11.b}[1], [RGB], #1 + st1 {v12.b}[1], [RGB], #1 + + st1 {v10.b}[2], [RGB], #1 + st1 {v11.b}[2], [RGB], #1 + st1 {v12.b}[2], [RGB], #1 + + st1 {v10.b}[3], [RGB], #1 + st1 {v11.b}[3], [RGB], #1 + st1 {v12.b}[3], [RGB], #1 + + st1 {v10.b}[4], [RGB], #1 + st1 {v11.b}[4], [RGB], #1 + st1 {v12.b}[4], [RGB], #1 + + st1 {v10.b}[5], [RGB], #1 + st1 {v11.b}[5], [RGB], #1 + st1 {v12.b}[5], [RGB], #1 + + st1 {v10.b}[6], [RGB], #1 + st1 {v11.b}[6], [RGB], #1 + st1 {v12.b}[6], [RGB], #1 + + st1 {v10.b}[7], [RGB], #1 + st1 {v11.b}[7], [RGB], #1 + st1 {v12.b}[7], [RGB], #1 + .endif + .elseif \size == 4 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 + .elseif \size == 2 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 + .elseif \size == 1 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 + .elseif \size == 4 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 + .elseif \size == 2 + st4 {v10.b, 
v11.b, v12.b, v13.b}[4], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 + .elseif \size == 1 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 + .else + .error unsupported macroblock size + .endif + .elseif \bpp==16 + .if \size == 8 + st1 {v25.8h}, [RGB], 16 + .elseif \size == 4 + st1 {v25.4h}, [RGB], 8 + .elseif \size == 2 + st1 {v25.h}[4], [RGB], 2 + st1 {v25.h}[5], [RGB], 2 + .elseif \size == 1 + st1 {v25.h}[6], [RGB], 2 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ + g_offs, gsize, b_offs, bsize, \ + defsize, fast_st3 + +/* + * 2-stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ + smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + rshrn v20.4h, v20.4s, #15 + rshrn2 v20.8h, v22.4s, #15 + rshrn v24.4h, v24.4s, #14 + rshrn2 v24.8h, v26.4s, #14 + rshrn v28.4h, v28.4s, #14 + rshrn2 v28.8h, v30.4s, #14 + uaddw v20.8h, v20.8h, v0.8b + uaddw v24.8h, v24.8h, v0.8b + uaddw v28.8h, v28.8h, v0.8b + .if \bpp != 16 + sqxtun v1\g_offs\defsize, v20.8h + sqxtun v1\r_offs\defsize, v24.8h + sqxtun v1\b_offs\defsize, v28.8h + .else + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + sri v25.8h, v21.8h, #5 + sri v25.8h, v29.8h, #11 + .endif +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 + rshrn v20.4h, v20.4s, #15 + rshrn v24.4h, v24.4s, #14 + rshrn 
v28.4h, v28.4s, #14 + ld1 {v4.8b}, [U], 8 + rshrn2 v20.8h, v22.4s, #15 + rshrn2 v24.8h, v26.4s, #14 + rshrn2 v28.8h, v30.4s, #14 + ld1 {v5.8b}, [V], 8 + uaddw v20.8h, v20.8h, v0.8b + uaddw v24.8h, v24.8h, v0.8b + uaddw v28.8h, v28.8h, v0.8b + .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ + sqxtun v1\g_offs\defsize, v20.8h + ld1 {v0.8b}, [Y], 8 + sqxtun v1\r_offs\defsize, v24.8h + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + sqxtun v1\b_offs\defsize, v28.8h + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + .else /**************************** rgb565 ********************************/ + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + ld1 {v0.8b}, [Y], 8 + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + sri v25.8h, v21.8h, #5 + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + sri v25.8h, v29.8h, #11 + .endif + do_store \bpp, 8, \fast_st3 + smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ + smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +/* Apple gas crashes on adrl, work around that 
by using adr. + * But this requires a copy of these constants for each function. + */ + +.balign 16 +.if \fast_st3 == 1 +Ljsimd_ycc_\colorid\()_neon_consts: +.else +Ljsimd_ycc_\colorid\()_neon_slowst3_consts: +.endif + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +.if \fast_st3 == 1 +asm_function jsimd_ycc_\colorid\()_convert_neon +.else +asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 +.endif + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + INPUT_ROW .req w2 + OUTPUT_BUF .req x3 + NUM_ROWS .req w4 + + INPUT_BUF0 .req x5 + INPUT_BUF1 .req x6 + INPUT_BUF2 .req x1 + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w15 + + sub sp, sp, 64 + mov x9, sp + + /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ + .if \fast_st3 == 1 + adr x15, Ljsimd_ycc_\colorid\()_neon_consts + .else + adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts + .endif + + /* Save NEON registers */ + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v0.4h, v1.4h}, [x15], 16 + ld1 {v2.8h}, [x15] + + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #8] + ldr INPUT_BUF2, [INPUT_BUF, #16] + .unreq INPUT_BUF + + /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ + movi v10.16b, #255 + movi v13.16b, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3] + ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + b.lt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 + subs N, N, #8 + b.ge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8, \fast_st3 + tst N, #7 + b.eq 8f +3: + tst N, #4 + b.eq 3f + do_load 4 +3: + tst N, #2 + b.eq 4f + do_load 2 +4: + tst 
N, #1 + b.eq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + b.eq 6f + do_store \bpp, 4, \fast_st3 +6: + tst N, #2 + b.eq 7f + do_store \bpp, 2, \fast_st3 +7: + tst N, #1 + b.eq 8f + do_store \bpp, 1, \fast_st3 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + .unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 + +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + st1 
{v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + .elseif \size == 4 + st1 {v20.b}[0], [Y], #1 + st1 {v20.b}[1], [Y], #1 + st1 {v20.b}[2], [Y], #1 + st1 {v20.b}[3], [Y], #1 + st1 {v21.b}[0], [U], #1 + st1 {v21.b}[1], [U], #1 + st1 {v21.b}[2], [U], #1 + st1 {v21.b}[3], [U], #1 + st1 {v22.b}[0], [V], #1 + st1 {v22.b}[1], [V], #1 + st1 {v22.b}[2], [V], #1 + st1 {v22.b}[3], [V], #1 + .elseif \size == 2 + st1 {v20.b}[4], [Y], #1 + st1 {v20.b}[5], [Y], #1 + st1 {v21.b}[4], [U], #1 + st1 {v21.b}[5], [U], #1 + st1 {v22.b}[4], [V], #1 + st1 {v22.b}[5], [V], #1 + .elseif \size == 1 + st1 {v20.b}[6], [Y], #1 + st1 {v21.b}[6], [U], #1 + st1 {v22.b}[6], [V], #1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size, fast_ld3 + .if \bpp == 24 + .if \size == 8 + .if \fast_ld3 == 1 + ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 + .else + ld1 {v10.b}[0], [RGB], #1 + ld1 {v11.b}[0], [RGB], #1 + ld1 {v12.b}[0], [RGB], #1 + + ld1 {v10.b}[1], [RGB], #1 + ld1 {v11.b}[1], [RGB], #1 + ld1 {v12.b}[1], [RGB], #1 + + ld1 {v10.b}[2], [RGB], #1 + ld1 {v11.b}[2], [RGB], #1 + ld1 {v12.b}[2], [RGB], #1 + + ld1 {v10.b}[3], [RGB], #1 + ld1 {v11.b}[3], [RGB], #1 + ld1 {v12.b}[3], [RGB], #1 + + ld1 {v10.b}[4], [RGB], #1 + ld1 {v11.b}[4], [RGB], #1 + ld1 {v12.b}[4], [RGB], #1 + + ld1 {v10.b}[5], [RGB], #1 + ld1 {v11.b}[5], [RGB], #1 + ld1 {v12.b}[5], [RGB], #1 + + ld1 {v10.b}[6], [RGB], #1 + ld1 {v11.b}[6], [RGB], #1 + ld1 {v12.b}[6], [RGB], #1 + + ld1 {v10.b}[7], [RGB], #1 + ld1 {v11.b}[7], [RGB], #1 + ld1 {v12.b}[7], [RGB], #1 + .endif + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 + .elseif \size == 2 + ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 + .elseif \size == 1 + ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 + .else + .error unsupported 
macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 + .elseif \size == 2 + ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 + .elseif \size == 1 + ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ + b_offs, fast_ld3 + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ + ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ + ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ + rev64 v18.4s, v1.4s + rev64 v26.4s, v1.4s + rev64 v28.4s, v1.4s + rev64 v30.4s, v1.4s + umull v14.4s, v4.4h, v0.h[0] + umull2 v16.4s, v4.8h, v0.h[0] + umlsl v18.4s, v4.4h, v0.h[3] + umlsl2 v26.4s, v4.8h, v0.h[3] + umlal v28.4s, v4.4h, v0.h[5] + umlal2 v30.4s, v4.8h, v0.h[5] + umlal v14.4s, v6.4h, v0.h[1] + umlal2 v16.4s, v6.8h, v0.h[1] + umlsl v18.4s, v6.4h, v0.h[4] + umlsl2 v26.4s, v6.8h, v0.h[4] + umlsl v28.4s, v6.4h, v0.h[6] + umlsl2 v30.4s, v6.8h, v0.h[6] + umlal v14.4s, v8.4h, v0.h[2] + umlal2 v16.4s, v8.8h, v0.h[2] + umlal v18.4s, v8.4h, v0.h[5] + umlal2 v26.4s, v8.8h, v0.h[5] + umlsl v28.4s, v8.4h, v0.h[7] + umlsl2 v30.4s, v8.8h, v0.h[7] +.endm + +.macro do_rgb_to_yuv_stage2 + rshrn v20.4h, v14.4s, #16 + shrn v22.4h, v18.4s, #16 + shrn v24.4h, v28.4s, #16 + rshrn2 v20.8h, v16.4s, #16 + shrn2 v22.8h, v26.4s, #16 + shrn2 v24.8h, v30.4s, #16 + xtn v20.8b, v20.8h /* v20 = y */ + xtn v21.8b, v22.8h /* v21 = u */ + xtn v22.8b, v24.8h /* v22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 
+ do_rgb_to_yuv_stage2 +.endm + +/* TODO: expand macros and interleave instructions if some in-order + * ARM64 processor actually can dual-issue LOAD/STORE with ALU */ +.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 + do_rgb_to_yuv_stage2 + do_load \bpp, 8, \fast_ld3 + st1 {v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + do_rgb_to_yuv_stage1 +.endm + +.balign 16 +.if \fast_ld3 == 1 +Ljsimd_\colorid\()_ycc_neon_consts: +.else +Ljsimd_\colorid\()_ycc_neon_slowld3_consts: +.endif + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +.if \fast_ld3 == 1 +asm_function jsimd_\colorid\()_ycc_convert_neon +.else +asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 +.endif + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + OUTPUT_BUF .req x2 + OUTPUT_ROW .req w3 + NUM_ROWS .req w4 + + OUTPUT_BUF0 .req x5 + OUTPUT_BUF1 .req x6 + OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w12 + + /* Load constants to d0, d1, d2, d3 */ + .if \fast_ld3 == 1 + adr x13, Ljsimd_\colorid\()_ycc_neon_consts + .else + adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts + .endif + ld1 {v0.8h, v1.8h}, [x13] + + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] + .unreq OUTPUT_BUF + + /* Save NEON registers */ + sub sp, sp, #64 + mov x9, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load \bpp, 8, \fast_ld3 + do_rgb_to_yuv_stage1 + subs N, N, #8 + b.lt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 
\fast_ld3 + subs N, N, #8 + b.ge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + b.eq 8f +3: + tbz N, #2, 3f + do_load \bpp, 4, \fast_ld3 +3: + tbz N, #1, 4f + do_load \bpp, 2, \fast_ld3 +4: + tbz N, #0, 5f + do_load \bpp, 1, \fast_ld3 +5: + do_rgb_to_yuv + tbz N, #2, 6f + do_store 4 +6: + tbz N, #1, 7f + do_store 2 +7: + tbz N, #0, 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B Fast LD3 */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 + +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req x0 + START_COL .req x1 + WORKSPACE .req x2 + TMP1 .req x9 + TMP2 .req x10 + TMP3 .req x11 + TMP4 .req x12 + TMP5 .req x13 + TMP6 .req x14 + TMP7 .req x15 + TMP8 .req x4 + TMPDUP .req w3 + + /* 
START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x1 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x1, w1 + + mov TMPDUP, #128 + ldp TMP1, TMP2, [SAMPLE_DATA], 16 + ldp TMP3, TMP4, [SAMPLE_DATA], 16 + dup v0.8b, TMPDUP + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + ldp TMP5, TMP6, [SAMPLE_DATA], 16 + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + ldp TMP7, TMP8, [SAMPLE_DATA], 16 + add TMP5, TMP5, START_COL + add TMP6, TMP6, START_COL + ld1 {v16.8b}, [TMP1] + add TMP7, TMP7, START_COL + add TMP8, TMP8, START_COL + ld1 {v17.8b}, [TMP2] + usubl v16.8h, v16.8b, v0.8b + ld1 {v18.8b}, [TMP3] + usubl v17.8h, v17.8b, v0.8b + ld1 {v19.8b}, [TMP4] + usubl v18.8h, v18.8b, v0.8b + ld1 {v20.8b}, [TMP5] + usubl v19.8h, v19.8b, v0.8b + ld1 {v21.8b}, [TMP6] + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 + usubl v20.8h, v20.8b, v0.8b + ld1 {v22.8b}, [TMP7] + usubl v21.8h, v21.8b, v0.8b + ld1 {v23.8b}, [TMP8] + usubl v22.8h, v22.8b, v0.8b + usubl v23.8h, v23.8b, v0.8b + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 + + br x30 + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + .unreq TMPDUP + +/*****************************************************************************/ + +/* + * jsimd_fdct_islow_neon + * + * This function contains a slow-but-accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). The following code is based + * directly on the IJG's original jfdctint.c; see the jfdctint.c for + * more details. 
+ * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS-PASS1_BITS) +#define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_fdct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_fdct_islow_neon + + DATA .req x0 + TMP .req x9 + + /* Load constants */ + adr TMP, Ljsimd_fdct_islow_neon_consts + ld1 {v0.8h, v1.8h}, [TMP] + + /* Save NEON registers */ + sub sp, sp, #64 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 + + /* 
Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v16.8h + * 1 | d18 | d19 | v17.8h + * 2 | d20 | d21 | v18.8h + * 3 | d22 | d23 | v19.8h + * 4 | d24 | d25 | v20.8h + * 5 | d26 | d27 | v21.8h + * 6 | d28 | d29 | v22.8h + * 7 | d30 | d31 | v23.8h + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + sub DATA, DATA, #64 + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ + shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, 
XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P1 + rshrn v22.4h, v22.4s, #DESCALE_P1 + rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - 
FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s /* z3 += z5 */ + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s /* z4 += z5 */ + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P1 + rshrn v21.4h, v29.4s, #DESCALE_P1 + rshrn v19.4h, v30.4s, #DESCALE_P1 + rshrn v17.4h, v31.4s, #DESCALE_P1 + rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - 
dataptr[4]; */ + + /* even part */ + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */ + srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P2 + rshrn v22.4h, v22.4s, #DESCALE_P2 + rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */ + + /* Odd part */ + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, 
XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P2 + rshrn v21.4h, v29.4s, #DESCALE_P2 + rshrn v19.4h, v30.4s, #DESCALE_P2 + rshrn v17.4h, v31.4s, #DESCALE_P2 + rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, 
CONST_BITS+PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */ + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + /* Restore NEON registers */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + + br x30 + + .unreq DATA + .unreq TMP + +#undef XFIX_P_0_298 +#undef XFIX_N_0_390 +#undef XFIX_P_0_541 +#undef XFIX_P_0_765 +#undef XFIX_N_0_899 +#undef XFIX_P_1_175 +#undef XFIX_P_1_501 +#undef XFIX_N_1_847 +#undef XFIX_N_1_961 +#undef XFIX_P_2_053 +#undef XFIX_N_2_562 +#undef XFIX_P_3_072 + + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). 
It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#undef XFIX_0_541196100 +#define XFIX_0_382683433 v0.h[0] +#define XFIX_0_541196100 v0.h[1] +#define XFIX_0_707106781 v0.h[2] +#define XFIX_1_306562965 v0.h[3] + +.balign 16 +Ljsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +asm_function jsimd_fdct_ifast_neon + + DATA .req x0 + TMP .req x9 + + /* Load constants */ + adr TMP, Ljsimd_fdct_ifast_neon_consts + ld1 {v0.4h}, [TMP] + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v16.8h + * 1 | d18 | d19 | v17.8h + * 2 | d20 | d21 | v18.8h + * 3 | d22 | d23 | v19.8h + * 4 | d24 | d25 | v20.8h + * 5 | d26 | d27 | v21.8h + * 6 | d28 | d29 | v22.8h + * 7 | d30 | d31 | v23.8h + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + mov TMP, #2 + sub DATA, DATA, #64 +1: + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4 + subs TMP, TMP, #1 + /* 1-D FDCT */ + add v4.8h, v19.8h, v20.8h + sub v20.8h, v19.8h, v20.8h + sub v28.8h, v18.8h, v21.8h + add v18.8h, v18.8h, v21.8h + sub v29.8h, v17.8h, v22.8h + add v17.8h, v17.8h, v22.8h + sub v21.8h, v16.8h, v23.8h + add v16.8h, v16.8h, v23.8h + sub v6.8h, v17.8h, v18.8h + sub v7.8h, v16.8h, v4.8h + add v5.8h, v17.8h, v18.8h + add v6.8h, v6.8h, v7.8h + add v4.8h, v16.8h, v4.8h + sqdmulh v6.8h, v6.8h, XFIX_0_707106781 + add v19.8h, v20.8h, v28.8h + add v16.8h, v4.8h, v5.8h + sub v20.8h, v4.8h, v5.8h + add v5.8h, v28.8h, v29.8h + add v29.8h, v29.8h, v21.8h + sqdmulh v5.8h, v5.8h, XFIX_0_707106781 + sub v28.8h, v19.8h, v29.8h 
+ add v18.8h, v7.8h, v6.8h + sqdmulh v28.8h, v28.8h, XFIX_0_382683433 + sub v22.8h, v7.8h, v6.8h + sqdmulh v19.8h, v19.8h, XFIX_0_541196100 + sqdmulh v7.8h, v29.8h, XFIX_1_306562965 + add v6.8h, v21.8h, v5.8h + sub v5.8h, v21.8h, v5.8h + add v29.8h, v29.8h, v28.8h + add v19.8h, v19.8h, v28.8h + add v29.8h, v29.8h, v7.8h + add v21.8h, v5.8h, v19.8h + sub v19.8h, v5.8h, v19.8h + add v17.8h, v6.8h, v29.8h + sub v23.8h, v6.8h, v29.8h + + b.ne 1b + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + br x30 + + .unreq DATA + .unreq TMP +#undef XFIX_0_382683433 +#undef XFIX_0_541196100 +#undef XFIX_0_707106781 +#undef XFIX_1_306562965 + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, + * DCTELEM *workspace); + * + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req x0 + DIVISORS .req x1 + WORKSPACE .req x2 + + RECIPROCAL .req DIVISORS + CORRECTION .req x9 + SHIFT .req x10 + LOOP_COUNT .req x11 + + mov LOOP_COUNT, #2 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) +1: + subs LOOP_COUNT, LOOP_COUNT, #1 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 + abs v20.8h, v0.8h + abs v21.8h, v1.8h + abs v22.8h, v2.8h + abs v23.8h, v3.8h + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 + add v20.8h, v20.8h, v4.8h /* add correction */ + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v6.8h + add v23.8h, v23.8h, v7.8h + umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ + umull2 v16.4s, v20.8h, v28.8h + umull v5.4s, v21.4h, v29.4h + umull2 v17.4s, v21.8h, v29.8h + umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ + umull2 v18.4s, v22.8h, v30.8h + umull v7.4s, v23.4h, v31.4h + umull2 v19.4s, v23.8h, v31.8h + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 + shrn v4.4h, v4.4s, #16 + shrn 
v5.4h, v5.4s, #16 + shrn v6.4h, v6.4s, #16 + shrn v7.4h, v7.4s, #16 + shrn2 v4.8h, v16.4s, #16 + shrn2 v5.8h, v17.4s, #16 + shrn2 v6.8h, v18.4s, #16 + shrn2 v7.8h, v19.4s, #16 + neg v24.8h, v24.8h + neg v25.8h, v25.8h + neg v26.8h, v26.8h + neg v27.8h, v27.8h + sshr v0.8h, v0.8h, #15 /* extract sign */ + sshr v1.8h, v1.8h, #15 + sshr v2.8h, v2.8h, #15 + sshr v3.8h, v3.8h, #15 + ushl v4.8h, v4.8h, v24.8h /* shift */ + ushl v5.8h, v5.8h, v25.8h + ushl v6.8h, v6.8h, v26.8h + ushl v7.8h, v7.8h, v27.8h + + eor v4.16b, v4.16b, v0.16b /* restore sign */ + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + sub v4.8h, v4.8h, v0.8h + sub v5.8h, v5.8h, v1.8h + sub v6.8h, v6.8h, v2.8h + sub v7.8h, v7.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 + + b.ne 1b + + br x30 /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. 
+ * + * GLOBAL(void) + * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, + * JDIMENSION width_blocks, JSAMPARRAY input_data, + * JSAMPARRAY output_data); + */ + +.balign 16 +Ljsimd_h2_downsample_neon_consts: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ + .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ + .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, \ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ + +asm_function jsimd_h2v1_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR .req x10 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + /* image_width, v_samp_factor and width_blocks are JDIMENSION (unsigned int) + arguments, so the ABI doesn't guarantee that the upper (unused) 32 bits + of x0, x2 and x3 are valid. Zero them before the 64-bit address + arithmetic and loop counters below use those registers. */ + uxtw x0, w0 + uxtw x2, w2 + uxtw x3, w3 + + mov TMPDUP, #0x10000 + lsl TMP2, BLOCK_WIDTH, #4 + sub TMP2, TMP2, IMAGE_WIDTH + adr TMP3, Ljsimd_h2_downsample_neon_consts + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.16b}, [TMP3] + +1: /* row loop */ + ldr INPTR, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld1 {v0.16b}, [INPTR], #16 + mov v4.16b, v16.16b + subs TMP1, TMP1, #1 + uadalp v4.8h, v0.16b + shrn v6.8b, v4.8h, #1 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR] + mov v4.16b, v16.16b + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.16b, {v0.16b}, v18.16b + uadalp v4.8h, v2.16b + shrn v6.8b, v4.8h, #1 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 2:1 vertical, + * without smoothing. 
+ * + * GLOBAL(void) + * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, JDIMENSION width_blocks, + * JSAMPARRAY input_data, JSAMPARRAY output_data); + */ + +.balign 16 +asm_function jsimd_h2v2_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR0 .req x10 + INPTR1 .req x14 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + /* image_width, v_samp_factor and width_blocks are JDIMENSION (unsigned int) + arguments, so the ABI doesn't guarantee that the upper (unused) 32 bits + of x0, x2 and x3 are valid. Zero them before the 64-bit address + arithmetic and loop counters below use those registers. */ + uxtw x0, w0 + uxtw x2, w2 + uxtw x3, w3 + + mov TMPDUP, #1 + lsl TMP2, BLOCK_WIDTH, #4 + lsl TMPDUP, TMPDUP, #17 + sub TMP2, TMP2, IMAGE_WIDTH + adr TMP3, Ljsimd_h2_downsample_neon_consts + orr TMPDUP, TMPDUP, #1 + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.16b}, [TMP3] + +1: /* row loop */ + ldr INPTR0, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + ldr INPTR1, [INPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld1 {v0.16b}, [INPTR0], #16 + ld1 {v1.16b}, [INPTR1], #16 + mov v4.16b, v16.16b + subs TMP1, TMP1, #1 + uadalp v4.8h, v0.16b + uadalp v4.8h, v1.16b + shrn v6.8b, v4.8h, #2 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR0], #16 + ld1 {v1.16b}, [INPTR1], #16 + mov v4.16b, v16.16b + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.16b, {v0.16b}, v18.16b + tbl v3.16b, {v1.16b}, v18.16b + uadalp v4.8h, v2.16b + uadalp v4.8h, v3.16b + shrn v6.8b, v4.8h, #2 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR0 + .unreq INPTR1 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET*) + * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + + BUFFER .req 
x1 + PUT_BUFFER .req x6 + PUT_BITS .req x7 + PUT_BITSw .req w7 + /* emit_byte: drop PUT_BITS by 8, take the byte of PUT_BUFFER at that bit offset, store it at ++BUFFER and, when it is 0xff, store an extra zero byte after it; clobbers x19 */ +.macro emit_byte + sub PUT_BITS, PUT_BITS, #0x8 + lsr x19, PUT_BUFFER, PUT_BITS + uxtb w19, w19 + strb w19, [BUFFER, #1]! + cmp w19, #0xff + b.ne 14f + strb wzr, [BUFFER, #1]! +14: +.endm /* put_bits: shift PUT_BUFFER left by SIZE, add SIZE to PUT_BITS and OR CODE into the vacated low bits */ +.macro put_bits CODE, SIZE + lsl PUT_BUFFER, PUT_BUFFER, \SIZE + add PUT_BITS, PUT_BITS, \SIZE + orr PUT_BUFFER, PUT_BUFFER, \CODE +.endm /* checkbuf31: once at least 0x20 bits are pending, flush four bytes */ +.macro checkbuf31 + cmp PUT_BITS, #0x20 + b.lt 31f + emit_byte + emit_byte + emit_byte + emit_byte +31: +.endm /* checkbuf47: once at least 0x30 bits are pending, flush six bytes */ +.macro checkbuf47 + cmp PUT_BITS, #0x30 + b.lt 47f + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte +47: +.endm + /* generate_jsimd_huff_encode_one_block: expands to a constant table plus one encoder entry point. fast_tbl == 1 produces jsimd_huff_encode_one_block_neon, which reorders the block with tbl/tbx index vectors; any other value produces jsimd_huff_encode_one_block_neon_slowtbl, which gathers the block with one halfword lane load per coefficient. */ +.macro generate_jsimd_huff_encode_one_block fast_tbl + +.balign 16 +.if \fast_tbl == 1 +Ljsimd_huff_encode_one_block_neon_consts: +.else +Ljsimd_huff_encode_one_block_neon_slowtbl_consts: +.endif + .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 /* one bit per byte position; loaded into v23 and ANDed with the cmeq results below */ +.if \fast_tbl == 1 + .byte 0, 1, 2, 3, 16, 17, 32, 33, \ + 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ + .byte 34, 35, 48, 49, 255, 255, 50, 51, \ + 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ + .byte 8, 9, 22, 23, 36, 37, 50, 51, \ + 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ + .byte 54, 55, 40, 41, 26, 27, 12, 13, \ + 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ + .byte 6, 7, 20, 21, 34, 35, 48, 49, \ + 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ + .byte 42, 43, 28, 29, 14, 15, 30, 31, \ + 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ + .byte 255, 255, 255, 255, 56, 57, 42, 43, \ + 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ + .byte 26, 27, 40, 41, 42, 43, 28, 29, \ + 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ + .byte 255, 255, 255, 255, 0, 1, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2
lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ + .byte 4, 5, 6, 7, 255, 255, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ +.endif + +.if \fast_tbl == 1 +asm_function jsimd_huff_encode_one_block_neon +.else +asm_function jsimd_huff_encode_one_block_neon_slowtbl +.endif + sub sp, sp, 272 /* 16 bytes for x19/x20 plus 256 bytes of scratch starting at sp + 16 */ + sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ + /* Save ARM registers */ + stp x19, x20, [sp] +.if \fast_tbl == 1 + adr x15, Ljsimd_huff_encode_one_block_neon_consts +.else + adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts +.endif + ldr PUT_BUFFER, [x0, #0x10] /* bit accumulator, read from the state argument (x0) at offset 0x10 */ + ldr PUT_BITSw, [x0, #0x18] /* pending bit count, read from offset 0x18 */ + ldrsh w12, [x2] /* load DC coeff in w12 */ + /* prepare data */ +.if \fast_tbl == 1 + ld1 {v23.16b}, [x15], #16 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + /* ZigZag 8x8 */ + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b + tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b + tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b + tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b + ins v0.h[0], w12 /* lane 0 becomes the DC difference (block[0] - last_dc_val) */ + tbx v1.16b, {v28.16b}, v16.16b /* tbx leaves lanes with index 255 untouched, so these four only patch in the lanes the tbl pass above could not reach (index 255 there) */ + tbx v2.16b, {v29.16b, v30.16b}, v17.16b + tbx v5.16b, {v29.16b, v30.16b}, v18.16b + tbx v6.16b, {v31.16b}, v19.16b +.else /* slow-table variant: every coefficient is fetched with its own halfword lane load from block (x2) plus a byte offset */ + add x13, x2, #0x22 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + ld1 {v23.16b}, [x15] + add x14, x2, #0x18 + add x3, x2, #0x36 + ins v0.h[0], w12 + add x9, x2, #0x2 + ld1 {v1.h}[0], [x13] +
add x15, x2, #0x30 + ld1 {v2.h}[0], [x14] + add x19, x2, #0x26 + ld1 {v3.h}[0], [x3] + add x20, x2, #0x28 + ld1 {v0.h}[1], [x9] + add x12, x2, #0x10 + ld1 {v1.h}[1], [x15] + add x13, x2, #0x40 + ld1 {v2.h}[1], [x19] + add x14, x2, #0x34 + ld1 {v3.h}[1], [x20] + add x3, x2, #0x1a + ld1 {v0.h}[2], [x12] + add x9, x2, #0x20 + ld1 {v1.h}[2], [x13] + add x15, x2, #0x32 + ld1 {v2.h}[2], [x14] + add x19, x2, #0x42 + ld1 {v3.h}[2], [x3] + add x20, x2, #0xc + ld1 {v0.h}[3], [x9] + add x12, x2, #0x12 + ld1 {v1.h}[3], [x15] + add x13, x2, #0x24 + ld1 {v2.h}[3], [x19] + add x14, x2, #0x50 + ld1 {v3.h}[3], [x20] + add x3, x2, #0xe + ld1 {v0.h}[4], [x12] + add x9, x2, #0x4 + ld1 {v1.h}[4], [x13] + add x15, x2, #0x16 + ld1 {v2.h}[4], [x14] + add x19, x2, #0x60 + ld1 {v3.h}[4], [x3] + add x20, x2, #0x1c + ld1 {v0.h}[5], [x9] + add x12, x2, #0x6 + ld1 {v1.h}[5], [x15] + add x13, x2, #0x8 + ld1 {v2.h}[5], [x19] + add x14, x2, #0x52 + ld1 {v3.h}[5], [x20] + add x3, x2, #0x2a + ld1 {v0.h}[6], [x12] + add x9, x2, #0x14 + ld1 {v1.h}[6], [x13] + add x15, x2, #0xa + ld1 {v2.h}[6], [x14] + add x19, x2, #0x44 + ld1 {v3.h}[6], [x3] + add x20, x2, #0x38 + ld1 {v0.h}[7], [x9] + add x12, x2, #0x46 + ld1 {v1.h}[7], [x15] + add x13, x2, #0x3a + ld1 {v2.h}[7], [x19] + add x14, x2, #0x74 + ld1 {v3.h}[7], [x20] + add x3, x2, #0x6a + ld1 {v4.h}[0], [x12] + add x9, x2, #0x54 + ld1 {v5.h}[0], [x13] + add x15, x2, #0x2c + ld1 {v6.h}[0], [x14] + add x19, x2, #0x76 + ld1 {v7.h}[0], [x3] + add x20, x2, #0x78 + ld1 {v4.h}[1], [x9] + add x12, x2, #0x62 + ld1 {v5.h}[1], [x15] + add x13, x2, #0x1e + ld1 {v6.h}[1], [x19] + add x14, x2, #0x68 + ld1 {v7.h}[1], [x20] + add x3, x2, #0x7a + ld1 {v4.h}[2], [x12] + add x9, x2, #0x70 + ld1 {v5.h}[2], [x13] + add x15, x2, #0x2e + ld1 {v6.h}[2], [x14] + add x19, x2, #0x5a + ld1 {v7.h}[2], [x3] + add x20, x2, #0x6c + ld1 {v4.h}[3], [x9] + add x12, x2, #0x72 + ld1 {v5.h}[3], [x15] + add x13, x2, #0x3c + ld1 {v6.h}[3], [x19] + add x14, x2, #0x4c + ld1 {v7.h}[3], [x20] + add
x3, x2, #0x5e + ld1 {v4.h}[4], [x12] + add x9, x2, #0x64 + ld1 {v5.h}[4], [x13] + add x15, x2, #0x4a + ld1 {v6.h}[4], [x14] + add x19, x2, #0x3e + ld1 {v7.h}[4], [x3] + add x20, x2, #0x6e + ld1 {v4.h}[5], [x9] + add x12, x2, #0x56 + ld1 {v5.h}[5], [x15] + add x13, x2, #0x58 + ld1 {v6.h}[5], [x19] + add x14, x2, #0x4e + ld1 {v7.h}[5], [x20] + add x3, x2, #0x7c + ld1 {v4.h}[6], [x12] + add x9, x2, #0x48 + ld1 {v5.h}[6], [x13] + add x15, x2, #0x66 + ld1 {v6.h}[6], [x14] + add x19, x2, #0x5c + ld1 {v7.h}[6], [x3] + add x20, x2, #0x7e + ld1 {v4.h}[7], [x9] + ld1 {v5.h}[7], [x15] + ld1 {v6.h}[7], [x19] + ld1 {v7.h}[7], [x20] +.endif + cmlt v24.8h, v0.8h, #0 + cmlt v25.8h, v1.8h, #0 + cmlt v26.8h, v2.8h, #0 + cmlt v27.8h, v3.8h, #0 + cmlt v28.8h, v4.8h, #0 + cmlt v29.8h, v5.8h, #0 + cmlt v30.8h, v6.8h, #0 + cmlt v31.8h, v7.8h, #0 /* v24-v31 = sign masks: all ones in lanes whose coefficient is negative */ + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h /* v0-v7 = absolute values */ + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b /* v24-v31 = absolute value XOR sign mask */ + cmeq v16.8h, v0.8h, #0 + cmeq v17.8h, v1.8h, #0 + cmeq v18.8h, v2.8h, #0 + cmeq v19.8h, v3.8h, #0 + cmeq v20.8h, v4.8h, #0 + cmeq v21.8h, v5.8h, #0 + cmeq v22.8h, v6.8h, #0 + xtn v16.8b, v16.8h + xtn v18.8b, v18.8h + xtn v20.8b, v20.8h + xtn v22.8b, v22.8h + umov w14, v0.h[0] + xtn2 v16.16b, v17.8h + umov w13, v24.h[0] + xtn2 v18.16b, v19.8h + clz w14, w14 + xtn2 v20.16b, v21.8h + lsl w13, w13, w14 + cmeq v17.8h, v7.8h, #0 + sub w12, w14, #32 + xtn2 v22.16b, v17.8h + lsr w13, w13, w14 + and v16.16b, v16.16b, v23.16b + neg w12, w12 /* w12 = 32 - clz(|DC difference|), i.e. its bit length */ + and v18.16b, v18.16b, v23.16b + add x3, x4, #0x400 /* x3 = dctbl->ehufsi */ + and v20.16b, v20.16b, v23.16b + add x15, sp, #0x90 /* x15 = t2 */ + and v22.16b, v22.16b, v23.16b + ldr w10, [x4, x12, lsl #2] + addp v16.16b,
v16.16b, v18.16b + ldrb w11, [x3, x12] + addp v20.16b, v20.16b, v22.16b + checkbuf47 + addp v16.16b, v16.16b, v20.16b + put_bits x10, x11 /* code word read from dctbl (x4) at index w12, with its length from dctbl->ehufsi */ + addp v16.16b, v16.16b, v18.16b + checkbuf47 + umov x9,v16.D[0] + put_bits x13, x12 /* then the w12 value bits of the DC difference */ + cnt v17.8b, v16.8b /* popcount of the mask, whose set bits mark zero-valued entries */ + mvn x9, x9 /* inverted: set bits now mark non-zero entries */ + addv B18, v17.8b + add x4, x5, #0x400 /* x4 = actbl->ehufsi */ + umov w12, v18.b[0] /* w12 = number of zero-valued entries in the block */ + lsr x9, x9, #0x1 /* clear AC coeff */ + ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ + rbit x9, x9 /* x9 = index0 */ + ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ + cmp w12, #(64-8) + add x11, sp, #16 + b.lt 4f /* fewer than 56 zero entries: use the path at 4, which computes every bit length with vector instructions */ + cbz x9, 6f + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 /* spill v0-v7, then v24-v31, into the scratch area at sp + 16 */ + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 /* x2 = length of the zero run before the next non-zero entry */ + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w20, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 /* one actbl 0xf0 code for every 16 zeros in the run */ + b 2b +3: + clz w20, w20 + ldrh w3, [x15, #2]! + sub w11, w20, #32 + lsl w3, w3, w20 + neg w11, w11 /* w11 = 32 - clz: bit length of this coefficient */ + lsr w3, w3, w20 + add x2, x11, x2, lsl #4 /* table index = (zero run << 4) + bit length */ + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b + b 6f +4: + movi v21.8h, #0x0010 + clz v0.8h, v0.8h + clz v1.8h, v1.8h + clz v2.8h, v2.8h + clz v3.8h, v3.8h + clz v4.8h, v4.8h + clz v5.8h, v5.8h + clz v6.8h, v6.8h + clz v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + neg v0.8h, v0.8h + neg v1.8h, v1.8h + neg v2.8h, v2.8h + neg v3.8h, v3.8h + neg v4.8h, v4.8h + neg v5.8h, v5.8h + neg v6.8h, v6.8h + neg v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h,
v6.8h + ushl v31.8h, v31.8h, v7.8h /* shifted left then right by clz: only the low (16 - clz) bits of v24-v31 remain */ + add v0.8h, v21.8h, v0.8h + add v1.8h, v21.8h, v1.8h + add v2.8h, v21.8h, v2.8h + add v3.8h, v21.8h, v3.8h + add v4.8h, v21.8h, v4.8h + add v5.8h, v21.8h, v5.8h + add v6.8h, v21.8h, v6.8h + add v7.8h, v21.8h, v7.8h /* v0-v7 = 16 - clz: bit length of every coefficient */ + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w11, [x15, #-126] /* bit length precomputed above, read from the first half of the scratch area */ +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + ldrh w3, [x15, #2]! + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b +6: + add x13, sp, #0x10e + cmp x15, x13 + b.hs 1f /* skip the trailing code when x15 already sits on the last slot (sp + 0x90 + 2 * 63) */ + ldr w12, [x5] + ldrb w14, [x4] /* actbl->ehufco[0] and actbl->ehufsi[0]; presumably the end-of-block code - confirm against the C encoder */ + checkbuf47 + put_bits x12, x14 +1: + str PUT_BUFFER, [x0, #0x10] /* write the bit accumulator and bit count back to the state argument */ + str PUT_BITSw, [x0, #0x18] + ldp x19, x20, [sp], 16 + add x0, BUFFER, #0x1 /* return value: BUFFER + 1, undoing the decrement done on entry */ + add sp, sp, 256 + br x30 + +.endm + +generate_jsimd_huff_encode_one_block 1 +generate_jsimd_huff_encode_one_block 0 + + .unreq BUFFER + .unreq PUT_BUFFER + .unreq PUT_BITS + .unreq PUT_BITSw + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf31 +.purgem checkbuf47 diff --git a/Builder/jni-1.11/simd/jsimd_arm_neon.S b/Builder/jni-1.11/simd/jsimd_arm_neon.S new file mode 100644 index 000000000..cd2612724 --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_arm_neon.S @@ -0,0 +1,2878 @@ +/* + * ARMv7 NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka + * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved. + * Copyright (C) 2014, Linaro Limited. All Rights Reserved. + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
+ * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.arm +.syntax unified + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes: emits the global entry label for fname. With __APPLE__ defined the symbol gets a leading underscore; otherwise it is exported with .global and, when __ELF__ is defined, also marked .hidden with type %function. */ +.macro asm_function fname +#ifdef __APPLE__ + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4 x0, x1, x2, x3 + vtrn.16 \x0, \x1 + vtrn.16 \x2, \x3 + vtrn.32 \x0, \x2 + vtrn.32 \x1, \x3 +.endm + + /* CENTERJSAMPLE is the byte value jsimd_idct_islow_neon adds to its 8-bit results for the signed->unsigned conversion */ +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients.
+ * + * GLOBAL(void) + * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + /* Each FIX_a_bbbbbbbbb below is the value a.bbbbbbbbb scaled by 2^13 and rounded (e.g. 0.541196100 * 8192 ~= 4433); this matches the << 13 used in REF_1D_IDCT */ +#define FIX_0_298631336 (2446) +#define FIX_0_390180644 (3196) +#define FIX_0_541196100 (4433) +#define FIX_0_765366865 (6270) +#define FIX_0_899976223 (7373) +#define FIX_1_175875602 (9633) +#define FIX_1_501321110 (12299) +#define FIX_1_847759065 (15137) +#define FIX_1_961570560 (16069) +#define FIX_2_053119869 (16819) +#define FIX_2_562915447 (20995) +#define FIX_3_072711026 (25172) + +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) + +/* + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm' + */ +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ +{ \ + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ + JLONG q1, q2, q3, q4, q5, q6, q7; \ + JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ + \ + /* 1-D iDCT input data */ \ + row0 = xrow0; \ + row1 = xrow1; \ + row2 = xrow2; \ + row3 = xrow3; \ + row4 = xrow4; \ + row5 = xrow5; \ + row6 = xrow6; \ + row7 = xrow7; \ + \ + q5 = row7 + row3; \ + q4 = row5 + row1; \ + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ + MULTIPLY(q4, FIX_1_175875602); \ + q7 = MULTIPLY(q5, FIX_1_175875602) + \ + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ + q2 = MULTIPLY(row2, FIX_0_541196100) + \ + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ + q4 = q6; \ + q3 = ((JLONG) row0 - (JLONG) row4) << 13; \ + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ + /* now we can use q1 (reloadable constants have been used up) */ \ + q1 = q3 + q2; \ + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ + MULTIPLY(row1, -FIX_0_899976223); \ + q5 = q7; \ + q1 = q1 + q6; \ + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ + \ + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ + tmp11_plus_tmp2 = q1; \ + row1 = 0; \ + \ + q1 = q1 - q6; \ + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ + MULTIPLY(row3, -FIX_2_562915447); \ + q1 = q1 - q6; \ + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ + MULTIPLY(row6, FIX_0_541196100); \ + q3 = q3 - q2; \ + \ + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ + tmp11_minus_tmp2 = q1; \ + \ + q1 = ((JLONG) row0 + (JLONG) row4) << 13; \ + q2 = q1 + q6; \ + q1 = q1 - q6; \ + \ + /* pick up the results */ \ + tmp0 = q4; \ + tmp1 = q5; \ + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ + tmp3 = q7; \ + tmp10 = q2; \ + tmp11
= (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ + tmp12 = q3; \ + tmp13 = q1; \ +} + /* The XFIX_* names are the NEON register lanes that hold the constants once jsimd_idct_islow_neon_consts has been loaded into d0-d2 */ +#define XFIX_0_899976223 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_2_562915447 d0[2] +#define XFIX_0_298631336_MINUS_0_899976223 d0[3] +#define XFIX_1_501321110_MINUS_0_899976223 d1[0] +#define XFIX_2_053119869_MINUS_2_562915447 d1[1] +#define XFIX_0_541196100_PLUS_0_765366865 d1[2] +#define XFIX_1_175875602 d1[3] +#define XFIX_1_175875602_MINUS_0_390180644 d2[0] +#define XFIX_0_541196100_MINUS_1_847759065 d2[1] +#define XFIX_3_072711026_MINUS_2_562915447 d2[2] +#define XFIX_1_175875602_MINUS_1_961570560 d2[3] + +.balign 16 +jsimd_idct_islow_neon_consts: + .short FIX_0_899976223 /* d0[0] */ + .short FIX_0_541196100 /* d0[1] */ + .short FIX_2_562915447 /* d0[2] */ + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ + .short FIX_1_175875602 /* d1[3] */ + /* reloadable constants */ + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ + +asm_function jsimd_idct_islow_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip /* TMP1-TMP3 alias r0-r2, the same registers as DCT_TABLE, COEF_BLOCK and OUTPUT_BUF; TMP4 aliases ip */ + + ROW0L .req d16 + ROW0R .req d17 + ROW1L .req d18 + ROW1R .req d19 + ROW2L .req d20 + ROW2R .req d21 + ROW3L .req d22 + ROW3R .req d23 + ROW4L .req d24 + ROW4R .req d25 + ROW5L .req d26 + ROW5R .req d27 + ROW6L .req d28 + ROW6R .req d29 + ROW7L .req d30 + ROW7R .req d31 + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + *
5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_islow_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ + add ip, ip, #16 /* ip now points at the four 'reloadable' constants that the later vld1.s16 {d2} loads re-read */ + vmul.s16 q15, q15, q3 + vpush {d8-d15} /* save NEON registers */ + /* 1-D IDCT, pass 1, left 4x8 half */ + vadd.s16 d4, ROW7L, ROW3L + vadd.s16 d5, ROW5L, ROW1L + vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d5, XFIX_1_175875602 + vmull.s16 q7, d4, XFIX_1_175875602 + /* Check for the zero coefficients in the right 4x8 half */ + push {r4, r5} + vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW4L + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] /* COEF_BLOCK was advanced 96 bytes by the three write-back loads above, so this reads the raw 16-bit coefficients in columns 4-7 of row 1 into r4/r5 */ + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 + orr r0, r4, r5 /* r0 (all DCT_TABLE loads are already done) starts accumulating the OR of rows 1-7 */ + vmov q4, q6 + vmlsl.s16 q6, ROW5L, XFIX_2_562915447 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + orr r0, r0, r4 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + orr r0, r0, r5 + vadd.s32 q1, q3, q2 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] + vmov q5, q7 + vadd.s32 q1, q1, q6 + orr r0, r0, r4 + vmlsl.s16 q7, ROW7L, XFIX_0_899976223 + orr r0, r0, r5 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1L, q1, #11 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5L,
XFIX_2_053119869_MINUS_2_562915447 + orr r0, r0, r4 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + orr r0, r0, r5 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] + vmlal.s16 q6, ROW6L, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + orr r0, r0, r4 + vrshrn.s32 ROW6L, q1, #11 + orr r0, r0, r5 + vadd.s32 q1, q3, q5 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW4L + orr r0, r0, r4 + vrshrn.s32 ROW2L, q1, #11 + orr r0, r0, r5 + vrshrn.s32 ROW5L, q3, #11 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 + orr r0, r0, r4 + vadd.s32 q2, q5, q6 + orrs r0, r0, r5 /* Z is set when columns 4-7 of rows 1-7 are all zero; nothing below changes the flags before the beq 3f */ + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + orr r0, r4, r5 /* r0 = OR of row 0, columns 4-7 (flags untouched); tested with cmp r0, #0 at label 3 */ + vsub.s32 q3, q1, q4 + pop {r4, r5} + vrshrn.s32 ROW7L, q2, #11 + vrshrn.s32 ROW3L, q5, #11 + vrshrn.s32 ROW0L, q6, #11 + vrshrn.s32 ROW4L, q3, #11 + + beq 3f /* Go to do some special handling for the sparse + right 4x8 half */ + + /* 1-D IDCT, pass 1, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vadd.s16 d10, ROW7R, ROW3R + vadd.s16 d8, ROW5R, ROW1R + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d8, XFIX_1_175875602 + vtrn.16 ROW2L, ROW3L + vmull.s16 q7, d10, XFIX_1_175875602 + vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 + vtrn.16 ROW0L, ROW1L + vsubl.s16 q3, ROW0R, ROW4R + vmull.s16 q2, ROW2R, XFIX_0_541196100 + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vtrn.16 ROW4L, ROW5L + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 + vtrn.32 ROW1L, ROW3L + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1R, XFIX_0_899976223 + vtrn.32 ROW4L, ROW6L + vadd.s32 q1, q3, q2 + vmov q5, q7 +
vadd.s32 q1, q1, q6 + vtrn.32 ROW0L, ROW2L + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1R, q1, #11 + vtrn.32 ROW5L, ROW7L + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3R, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vrshrn.s32 ROW6R, q1, #11 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0R, ROW4R + vrshrn.s32 ROW2R, q1, #11 + vrshrn.s32 ROW5R, q3, #11 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vrshrn.s32 ROW7R, q2, #11 + vrshrn.s32 ROW3R, q5, #11 + vrshrn.s32 ROW0R, q6, #11 + vrshrn.s32 ROW4R, q3, #11 + /* Transpose right 4x8 half */ + vtrn.16 ROW6R, ROW7R + vtrn.16 ROW2R, ROW3R + vtrn.16 ROW0R, ROW1R + vtrn.16 ROW4R, ROW5R + vtrn.32 ROW1R, ROW3R + vtrn.32 ROW4R, ROW6R + vtrn.32 ROW0R, ROW2R + vtrn.32 ROW5R, ROW7R + +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW1L, XFIX_1_175875602 + vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ + vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW3L, XFIX_1_175875602 + vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ + vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ + vmov q4, q6 + vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ +
vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ + vsub.s32 q3, q3, q2 + vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ + /* 1-D IDCT, pass 2, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW5R, XFIX_1_175875602 + vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ + vmull.s16 q7, ROW7R, XFIX_1_175875602 + vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 + vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ + vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ + vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vmov q4, q6 +
vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ + vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ + vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ + vshrn.s32 ROW5R, q3, #16 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ + vshrn.s32 ROW4R, q3, #16 + +2: /* Descale to 8-bit and range limit */ + vqrshrn.s16 d16, q8, #2 + vqrshrn.s16 d17, q9, #2 + vqrshrn.s16 d18, q10, #2 + vqrshrn.s16 d19, q11, #2 + vpop {d8-d15} /* restore NEON registers */ + vqrshrn.s16 d20, q12, #2 + /* Transpose the final 8-bit samples and do signed->unsigned conversion */ + vtrn.16 q8, q9 + vqrshrn.s16 d21, q13, #2 + vqrshrn.s16 d22, q14, #2 + vmov.u8 q0, #(CENTERJSAMPLE) + vqrshrn.s16 d23, q15, #2 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vtrn.16 q10, q11 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} /* fetch two output row pointers; OUTPUT_COL is added to each before the 8-byte store */ + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vadd.u8 q10, q10, q0 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} /* last four row pointers; TMP3 aliases OUTPUT_BUF (r2), so the base register is overwritten by this load */ + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vtrn.8 d22, d23 + vst1.8 {d20}, [TMP1] + vadd.u8 q11, q11, q0 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ + + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vtrn.16 ROW2L, ROW3L + vtrn.16 ROW0L, ROW1L + vtrn.16 ROW4L, ROW5L + vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ + vtrn.32 ROW1L, ROW3L + vtrn.32 ROW4L, ROW6L + vtrn.32 ROW0L, ROW2L + vtrn.32 ROW5L, ROW7L + + cmp r0, #0 /* r0 holds the OR of row 0, columns 4-7, computed during pass 1 */ + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second + pass */ + + /* Only row 0 is non-zero for the right 4x8 half */ + vdup.s16 ROW1R, ROW0R[1] + vdup.s16 ROW2R, ROW0R[2] + vdup.s16 ROW3R, ROW0R[3] + vdup.s16 ROW4R, ROW0R[0] + vdup.s16 ROW5R, ROW0R[1] + vdup.s16 ROW6R, ROW0R[2] + vdup.s16 ROW7R, ROW0R[3] + vdup.s16 ROW0R, ROW0R[0] + b 1b /* Go to 'normal' second pass */ + +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW1L, XFIX_1_175875602 + vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW3L, XFIX_1_175875602 + vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vshll.s16 q3, ROW0L, #13 + vmov q4, q6 + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vadd.s32 q1, q1, q6 + vadd.s32 q6, q6, q6 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L,
XFIX_0_541196100_PLUS_0_765366865 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vshll.s16 q5, ROW0L, #13 + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW5L, XFIX_1_175875602 + vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW7L, XFIX_1_175875602 + vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 + vmull.s16 q2, ROW6L, XFIX_0_541196100 + vshll.s16 q3, ROW4L, #13 + vmov q4, q6 + vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 + vmlsl.s16 q4, ROW5L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 + vadd.s32 q1, q1, q6 + vadd.s32 q6, q6, q6 + vmlsl.s16 q5, ROW7L, XFIX_2_562915447 + vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vshll.s16 q5, ROW4L, #13 + vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ + vshrn.s32 ROW5R, q3, #16 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ + vshrn.s32 ROW4R, q3, #16 + b 2b /* Go to epilogue */ + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + .unreq
ROW0L + .unreq ROW0R + .unreq ROW1L + .unreq ROW1R + .unreq ROW2L + .unreq ROW2R + .unreq ROW3L + .unreq ROW3R + .unreq ROW4L + .unreq ROW4R + .unreq ROW5L + .unreq ROW5R + .unreq ROW6L + .unreq ROW6R + .unreq ROW7L + .unreq ROW7R + + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in ARM NEON case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */ + +#define XFIX_1_082392200 d0[0] +#define XFIX_1_414213562 d0[1] +#define XFIX_1_847759065 d0[2] +#define XFIX_2_613125930 d0[3] + +.balign 16 +jsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_ifast_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 
+ vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0}, [ip, :64] /* load constants */ + vmul.s16 q15, q15, q3 + vpush {d8-d13} /* save NEON registers */ + /* 1-D IDCT, pass 1 */ + vsub.s16 q2, q10, q14 + vadd.s16 q14, q10, q14 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vadd.s16 q10, q10, q2 + /* Transpose */ + vtrn.16 q8, q9 + vsub.s16 q11, q12, q1 + vtrn.16 q14, q15 + vadd.s16 q12, q12, q1 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q8, q10 + vtrn.32 q13, q15 + vswp d28, d21 + vswp d26, d19 + /* 1-D IDCT, pass 2 */ + vsub.s16 q2, q10, q14 + vswp d30, d23 + vadd.s16 q14, q10, q14 + vswp d24, d17 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + 
vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vpop {d8-d13} /* restore NEON registers */ + vadd.s16 q10, q10, q2 + vsub.s16 q11, q12, q1 + vadd.s16 q12, q12, q1 + /* Descale to 8-bit and range limit */ + vmov.u8 q0, #0x80 + vqshrn.s16 d16, q8, #5 + vqshrn.s16 d17, q9, #5 + vqshrn.s16 d18, q10, #5 + vqshrn.s16 d19, q11, #5 + vqshrn.s16 d20, q12, #5 + vqshrn.s16 d21, q13, #5 + vqshrn.s16 d22, q14, #5 + vqshrn.s16 d23, q15, #5 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vadd.u8 q10, q10, q0 + vadd.u8 q11, q11, q0 + /* Transpose the final 8-bit samples */ + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vst1.8 {d20}, [TMP1] + vtrn.8 d22, d23 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. 
It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires far fewer arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: slightly better instruction scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +jsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* d0[0] */ + .short -FIX_0_765366865 /* d0[1] */ + .short -FIX_0_211164243 /* d0[2] */ + .short FIX_1_451774981 /* d0[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* d2[0] */ + .short FIX_2_562915447 /* d2[1] */ + .short 1 << (CONST_BITS+1) /* d2[2] */ + .short 0 /* d2[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + vmull.s16 q14, \x4, d2[2] +
vmlal.s16 q14, \x8, d0[0] + vmlal.s16 q14, \x14, d0[1] + + vmull.s16 q13, \x16, d1[2] + vmlal.s16 q13, \x12, d1[3] + vmlal.s16 q13, \x10, d2[0] + vmlal.s16 q13, \x6, d2[1] + + vmull.s16 q15, \x4, d2[2] + vmlsl.s16 q15, \x8, d0[0] + vmlsl.s16 q15, \x14, d0[1] + + vmull.s16 q12, \x16, d0[2] + vmlal.s16 q12, \x12, d0[3] + vmlal.s16 q12, \x10, d1[0] + vmlal.s16 q12, \x6, d1[1] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y29, q14 + .else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y29, q14, #\shift + .endif + + vadd.s32 q10, q15, q12 + vsub.s32 q15, q15, q12 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q15, q15, #\shift + vmovn.s32 \y27, q10 + vmovn.s32 \y28, q15 + .else + vrshrn.s32 \y27, q10, #\shift + vrshrn.s32 \y28, q15, #\shift + .endif +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + vpush {d8-d15} + + /* Load constants (d3 is just used for padding) */ + adr TMP4, jsimd_idct_4x4_neon_consts + vld1.16 {d0, d1, d2, d3}, [TMP4, :128] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | d14 | d15 + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q3, q3, q10 + vmul.s16 q4, q4, q11 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! 
+ vmul.s16 q5, q5, q12 + vmul.s16 q6, q6, q13 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q7, q7, q14 + vmul.s16 q8, q8, q15 + + /* Pass 1: columns 0-3 (d4, d6, ...) first, then columns 4-7 (d5, d7, ...); descale shift is 12 */ + idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 + transpose_4x4 d5, d7, d9, d11 + + /* Pass 2: descale shift is 19, results are written to d26-d29 (q13, q14) */ + idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 + transpose_4x4 d26, d27, d28, d29 + + /* Range limit: add the 0x80 bias, then saturate to unsigned 8-bit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vadd.s16 q14, q14, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q14 + + /* Store results to the output buffer (four row pointers, each offset by OUTPUT_COL) */ + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use far fewer instructions on little-endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + vst1.32 {d26[0]}, [TMP1]! + vst1.32 {d27[0]}, [TMP3]! + vst1.32 {d26[1]}, [TMP2]! + vst1.32 {d27[1]}, [TMP4]! +#else + vst1.8 {d26[0]}, [TMP1]! /* byte-wise fallback, same lane-to-row mapping as the 32-bit stores */ + vst1.8 {d27[0]}, [TMP3]! + vst1.8 {d26[1]}, [TMP1]! + vst1.8 {d27[1]}, [TMP3]! + vst1.8 {d26[2]}, [TMP1]! + vst1.8 {d27[2]}, [TMP3]! + vst1.8 {d26[3]}, [TMP1]! + vst1.8 {d27[3]}, [TMP3]! + + vst1.8 {d26[4]}, [TMP2]! + vst1.8 {d27[4]}, [TMP4]! + vst1.8 {d26[5]}, [TMP2]! + vst1.8 {d27[5]}, [TMP4]! + vst1.8 {d26[6]}, [TMP2]! + vst1.8 {d27[6]}, [TMP4]! + vst1.8 {d26[7]}, [TMP2]! + vst1.8 {d27[7]}, [TMP4]! +#endif + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block.
It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires far fewer arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +jsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* d0[0] */ + .short FIX_0_850430095 /* d0[1] */ + .short -FIX_1_272758580 /* d0[2] */ + .short FIX_3_624509785 /* d0[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + vshll.s16 q14, \x4, #15 + vmull.s16 q13, \x6, d0[3] + vmlal.s16 q13, \x10, d0[2] + vmlal.s16 q13, \x12, d0[1] + vmlal.s16 q13, \x16, d0[0] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + + .if \shift > 16 /* narrowing-shift immediate tops out at 16: shift first, then narrow */ + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y27, q14 + .else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y27, q14, #\shift + .endif +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP2, jsimd_idct_2x2_neon_consts + vld1.16 {d0}, [TMP2, :64] + + /* Load the needed rows of COEF_BLOCK (rows 2, 4 and 6 are skipped) into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | - | - + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | - | - + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 */ + vld1.16 {d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */ + vld1.16 {d12, d13}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */ + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* Dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
+ vmul.s16 q2, q2, q9 + vmul.s16 q3, q3, q10 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27}, [DCT_TABLE, :128]! + vmul.s16 q6, q6, q13 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q8, q8, q15 + + /* Pass 1 */ +#if 0 + idct_helper d4, d6, d10, d12, d16, 13, d4, d6 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d11, d13, d17, 13, d5, d7 + transpose_4x4 d5, d7, d9, d11 +#else + vmull.s16 q13, d6, d0[3] + vmlal.s16 q13, d10, d0[2] + vmlal.s16 q13, d12, d0[1] + vmlal.s16 q13, d16, d0[0] + vmull.s16 q12, d7, d0[3] + vmlal.s16 q12, d11, d0[2] + vmlal.s16 q12, d13, d0[1] + vmlal.s16 q12, d17, d0[0] + vshll.s16 q14, d4, #15 + vshll.s16 q15, d5, #15 + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + vrshrn.s32 d4, q10, #13 + vrshrn.s32 d6, q14, #13 + vadd.s32 q10, q15, q12 + vsub.s32 q14, q15, q12 + vrshrn.s32 d5, q10, #13 + vrshrn.s32 d7, q14, #13 + vtrn.16 q2, q3 + vtrn.32 q3, q5 +#endif + + /* Pass 2 */ + idct_helper d4, d6, d10, d7, d11, 20, d26, d27 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q13 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[4]}, [TMP1]! + vst1.8 {d26[1]}, [TMP2]! + vst1.8 {d27[5]}, [TMP2]! 
+ + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + + +.macro do_load size + .if \size == 8 + vld1.8 {d4}, [U, :64]! + vld1.8 {d5}, [V, :64]! + vld1.8 {d0}, [Y, :64]! + pld [U, #64] + pld [V, #64] + pld [Y, #64] + .elseif \size == 4 + vld1.8 {d4[0]}, [U]! + vld1.8 {d4[1]}, [U]! + vld1.8 {d4[2]}, [U]! + vld1.8 {d4[3]}, [U]! + vld1.8 {d5[0]}, [V]! + vld1.8 {d5[1]}, [V]! + vld1.8 {d5[2]}, [V]! + vld1.8 {d5[3]}, [V]! + vld1.8 {d0[0]}, [Y]! + vld1.8 {d0[1]}, [Y]! + vld1.8 {d0[2]}, [Y]! + vld1.8 {d0[3]}, [Y]! + .elseif \size == 2 + vld1.8 {d4[4]}, [U]! + vld1.8 {d4[5]}, [U]! + vld1.8 {d5[4]}, [V]! + vld1.8 {d5[5]}, [V]! + vld1.8 {d0[4]}, [Y]! + vld1.8 {d0[5]}, [Y]! + .elseif \size == 1 + vld1.8 {d4[6]}, [U]! + vld1.8 {d5[6]}, [V]! + vld1.8 {d0[6]}, [Y]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size + .if \bpp == 24 + .if \size == 8 + vst3.8 {d10, d11, d12}, [RGB]! + .elseif \size == 4 + vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vst4.8 {d10, d11, d12, d13}, [RGB]! + .elseif \size == 4 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! 
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 16 + .if \size == 8 + vst1.16 {q15}, [RGB]! + .elseif \size == 4 + vst1.16 {d30}, [RGB]! + .elseif \size == 2 + vst1.16 {d31[0]}, [RGB]! + vst1.16 {d31[1]}, [RGB]! + .elseif \size == 1 + vst1.16 {d31[2]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + vrshrn.s32 d20, q10, #15 + vrshrn.s32 d21, q11, #15 + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vrshrn.s32 d28, q14, #14 + vrshrn.s32 d29, q15, #14 + vaddw.u8 q11, q10, d0 /* q11 = y + green term */ + vaddw.u8 q12, q12, d0 /* q12 = y + red term */ + vaddw.u8 q14, q14, d0 /* q14 = y + blue term */ + .if \bpp != 16 + vqmovun.s16 d1\g_offs, q11 + vqmovun.s16 d1\r_offs, q12 + vqmovun.s16 d1\b_offs, q14 + .else /* rgb565 */ + vqshlu.s16 q13, q11, #8 /* saturate green, left-aligned in each 16-bit lane */ + vqshlu.s16 q15, q12, #8 /* saturate red, left-aligned in each 16-bit lane */ + vqshlu.s16 q14, q14, #8 /* saturate blue, left-aligned in each 16-bit lane */ + vsri.u16 q15, q13, #5 /* insert green below the top 5 (red) bits */ + vsri.u16 q15, q14, #11 /* insert blue below the top 11 (red + green) bits */ + .endif +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 + /* "do_yuv_to_rgb_stage2" and "store" */ + vrshrn.s32 d20, q10,
#15 + /* "load" and "do_yuv_to_rgb_stage1" */ + pld [U, #64] + vrshrn.s32 d21, q11, #15 + pld [V, #64] + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vld1.8 {d4}, [U, :64]! + vrshrn.s32 d28, q14, #14 + vld1.8 {d5}, [V, :64]! + vrshrn.s32 d29, q15, #14 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ + vaddw.u8 q11, q10, d0 + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ + vqmovun.s16 d1\g_offs, q11 + pld [Y, #64] + vqmovun.s16 d1\r_offs, q12 + vld1.8 {d0}, [Y, :64]! + vqmovun.s16 d1\b_offs, q14 + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + do_store \bpp, 8 + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ + .else /**************************** rgb565 ********************************/ + vqshlu.s16 q13, q11, #8 + pld [Y, #64] + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vld1.8 {d0}, [Y, :64]! + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vsri.u16 q15, q13, #5 + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vsri.u16 q15, q14, #11 + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + do_store \bpp, 8 + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ + .endif +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +/* Apple gas crashes on adrl; work around that by using adr. + * But this requires a copy of these constants for each function.
+ */ + +.balign 16 +jsimd_ycc_\colorid\()_neon_consts: + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +asm_function jsimd_ycc_\colorid\()_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + INPUT_ROW .req r2 + OUTPUT_BUF .req r3 + NUM_ROWS .req r4 + + INPUT_BUF0 .req r5 + INPUT_BUF1 .req r6 + INPUT_BUF2 .req INPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d1, d2, d3 (d0 is just used for padding) */ + adr ip, jsimd_ycc_\colorid\()_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save ARM registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #4] + ldr INPUT_BUF2, [INPUT_BUF, #8] + .unreq INPUT_BUF + + /* Save NEON registers */ + vpush {d8-d15} + + /* Initially set d10, d11, d12, d13 to 0xFF */ + vmov.u8 q5, #255 + vmov.u8 q6, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] + ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + blt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load 4 +3: + tst N, #2 + beq 4f + do_load 2 +4: + tst N, #1 + beq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + beq 6f + do_store \bpp, 4 +6: + tst N, #2 + beq 7f + do_store \bpp, 2 +7: + tst N, #1 + beq 8f + do_store \bpp, 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + 
.unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + vst1.8 {d20}, [Y]! + vst1.8 {d21}, [U]! + vst1.8 {d22}, [V]! + .elseif \size == 4 + vst1.8 {d20[0]}, [Y]! + vst1.8 {d20[1]}, [Y]! + vst1.8 {d20[2]}, [Y]! + vst1.8 {d20[3]}, [Y]! + vst1.8 {d21[0]}, [U]! + vst1.8 {d21[1]}, [U]! + vst1.8 {d21[2]}, [U]! + vst1.8 {d21[3]}, [U]! + vst1.8 {d22[0]}, [V]! + vst1.8 {d22[1]}, [V]! + vst1.8 {d22[2]}, [V]! + vst1.8 {d22[3]}, [V]! + .elseif \size == 2 + vst1.8 {d20[4]}, [Y]! + vst1.8 {d20[5]}, [Y]! + vst1.8 {d21[4]}, [U]! + vst1.8 {d21[5]}, [U]! + vst1.8 {d22[4]}, [V]! + vst1.8 {d22[5]}, [V]! + .elseif \size == 1 + vst1.8 {d20[6]}, [Y]! + vst1.8 {d21[6]}, [U]! + vst1.8 {d22[6]}, [V]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size + .if \bpp == 24 + .if \size == 8 + vld3.8 {d10, d11, d12}, [RGB]! 
+ pld [RGB, #128] + .elseif \size == 4 + vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vld4.8 {d10, d11, d12, d13}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vrev64.32 q9, q1 + vrev64.32 q13, q1 + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.macro do_rgb_to_yuv_stage2 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + 
vshrn.u32 d22, q9, #16 + vshrn.u32 d23, q13, #16 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + vmovn.u16 d20, q10 /* d20 = y */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovn.u16 d22, q12 /* d22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +.macro do_rgb_to_yuv_stage2_store_load_stage1 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vrev64.32 q9, q1 + vshrn.u32 d23, q13, #16 + vrev64.32 q13, q1 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + do_load \bpp, 8 + vmovn.u16 d20, q10 /* d20 = y */ + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovn.u16 d22, q12 /* d22 = v */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vst1.8 {d20}, [Y]! + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vst1.8 {d21}, [U]! + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vst1.8 {d22}, [V]! 
+ vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.balign 16 +jsimd_\colorid\()_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +asm_function jsimd_\colorid\()_ycc_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + OUTPUT_BUF .req r2 + OUTPUT_ROW .req r3 + NUM_ROWS .req r4 + + OUTPUT_BUF0 .req r5 + OUTPUT_BUF1 .req r6 + OUTPUT_BUF2 .req OUTPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d0, d1, d2, d3 */ + adr ip, jsimd_\colorid\()_ycc_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save ARM registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] + .unreq OUTPUT_BUF + + /* Save NEON registers */ + vpush {d8-d15} + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load \bpp, 8 + do_rgb_to_yuv_stage1 + subs N, N, #8 + blt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load \bpp, 4 +3: + tst N, #2 + beq 4f + do_load \bpp, 2 +4: + tst N, #1 + beq 5f + do_load \bpp, 1 +5: + do_rgb_to_yuv + tst N, #4 + beq 6f + do_store 4 +6: + tst N, #2 + beq 7f + do_store 2 +7: + tst N, #1 + beq 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + 
.unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req r0 + START_COL .req r1 + WORKSPACE .req r2 + TMP1 .req r3 + TMP2 .req r4 + TMP3 .req r5 + TMP4 .req ip + + push {r4, r5} + vmov.u8 d0, #128 + + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d16}, [TMP1] + vsubl.u8 q8, d16, d0 + vld1.8 {d18}, [TMP2] + vsubl.u8 q9, d18, d0 + vld1.8 {d20}, [TMP3] + vsubl.u8 q10, d20, d0 + vld1.8 {d22}, [TMP4] + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + vsubl.u8 q11, d22, d0 + vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! 
+ add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d24}, [TMP1] + vsubl.u8 q12, d24, d0 + vld1.8 {d26}, [TMP2] + vsubl.u8 q13, d26, d0 + vld1.8 {d28}, [TMP3] + vsubl.u8 q14, d28, d0 + vld1.8 {d30}, [TMP4] + vsubl.u8 q15, d30, d0 + vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! + vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! + pop {r4, r5} + bx lr + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define XFIX_0_382683433 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_0_707106781 d0[2] +#define XFIX_1_306562965 d0[3] + +.balign 16 +jsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +asm_function jsimd_fdct_ifast_neon + + DATA .req r0 + TMP .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP, jsimd_fdct_ifast_neon_consts + vld1.16 {d0}, [TMP, :64] + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | q8 + * 1 | d18 | d19 | q9 + * 2 | d20 | d21 | q10 + * 3 | d22 | d23 | q11 + * 4 | d24 | d25 | q12 + * 5 | d26 | d27 | q13 + * 6 | d28 | d29 | q14 + * 7 | d30 | d31 | q15 + */ + + vld1.16 {d16, d17, d18, d19}, [DATA, :128]! + vld1.16 {d20, d21, d22, d23}, [DATA, :128]! + vld1.16 {d24, d25, d26, d27}, [DATA, :128]! 
+ vld1.16 {d28, d29, d30, d31}, [DATA, :128] + sub DATA, DATA, #(128 - 32) + + mov TMP, #2 +1: + /* Transpose */ + vtrn.16 q12, q13 + vtrn.16 q10, q11 + vtrn.16 q8, q9 + vtrn.16 q14, q15 + vtrn.32 q9, q11 + vtrn.32 q13, q15 + vtrn.32 q8, q10 + vtrn.32 q12, q14 + vswp d30, d23 + vswp d24, d17 + vswp d26, d19 + /* 1-D FDCT */ + vadd.s16 q2, q11, q12 + vswp d28, d21 + vsub.s16 q12, q11, q12 + vsub.s16 q6, q10, q13 + vadd.s16 q10, q10, q13 + vsub.s16 q7, q9, q14 + vadd.s16 q9, q9, q14 + vsub.s16 q1, q8, q15 + vadd.s16 q8, q8, q15 + vsub.s16 q4, q9, q10 + vsub.s16 q5, q8, q2 + vadd.s16 q3, q9, q10 + vadd.s16 q4, q4, q5 + vadd.s16 q2, q8, q2 + vqdmulh.s16 q4, q4, XFIX_0_707106781 + vadd.s16 q11, q12, q6 + vadd.s16 q8, q2, q3 + vsub.s16 q12, q2, q3 + vadd.s16 q3, q6, q7 + vadd.s16 q7, q7, q1 + vqdmulh.s16 q3, q3, XFIX_0_707106781 + vsub.s16 q6, q11, q7 + vadd.s16 q10, q5, q4 + vqdmulh.s16 q6, q6, XFIX_0_382683433 + vsub.s16 q14, q5, q4 + vqdmulh.s16 q11, q11, XFIX_0_541196100 + vqdmulh.s16 q5, q7, XFIX_1_306562965 + vadd.s16 q4, q1, q3 + vsub.s16 q3, q1, q3 + vadd.s16 q7, q7, q6 + vadd.s16 q11, q11, q6 + vadd.s16 q7, q7, q5 + vadd.s16 q13, q3, q11 + vsub.s16 q11, q3, q11 + vadd.s16 q9, q4, q7 + vsub.s16 q15, q4, q7 + subs TMP, TMP, #1 + bne 1b + + /* store results */ + vst1.16 {d16, d17, d18, d19}, [DATA, :128]! + vst1.16 {d20, d21, d22, d23}, [DATA, :128]! + vst1.16 {d24, d25, d26, d27}, [DATA, :128]! + vst1.16 {d28, d29, d30, d31}, [DATA, :128] + + vpop {d8-d15} + bx lr + + .unreq DATA + .unreq TMP + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, + * DCTELEM *workspace); + * + * Note: the code uses 2 stage pipelining in order to improve instructions + * scheduling and eliminate stalls (this provides ~15% better + * performance for this function on both ARM Cortex-A8 and + * ARM Cortex-A9 when compared to the non-pipelined variant). 
+ * The instructions which belong to the second stage use different + * indentation for better readability. + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req r0 + DIVISORS .req r1 + WORKSPACE .req r2 + + RECIPROCAL .req DIVISORS + CORRECTION .req r3 + SHIFT .req ip + LOOP_COUNT .req r4 + + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + vabs.s16 q12, q0 + add CORRECTION, DIVISORS, #(64 * 2) /* CORRECTION = DIVISORS + 128 bytes */ + add SHIFT, DIVISORS, #(64 * 6) /* SHIFT = DIVISORS + 384 bytes */ + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 /* negate the shift counts: VSHL by a negative count shifts right */ + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + + push {r4, r5} /* save r4 before it is used as LOOP_COUNT */ + mov LOOP_COUNT, #3 /* the first 16 values are already in flight; the loop body runs 3 more times */ +1: + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + veor.u16 q14, q14, q2 /* restore sign */ + vabs.s16 q12, q0 + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + veor.u16 q15, q15, q3 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vsub.u16 q14, q14, q2 /* (x ^ s) - s negates x wherever the sign mask s is all ones */ + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vsub.u16 q15, q15, q3 + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! 
+ vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + subs LOOP_COUNT, LOOP_COUNT, #1 + bne 1b + pop {r4, r5} + + veor.u16 q14, q14, q2 /* restore sign */ + veor.u16 q15, q15, q3 + vsub.u16 q14, q14, q2 + vsub.u16 q15, q15, q3 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! /* store the final 16 results left over from the last loop pass */ + + bx lr /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, + * JDIMENSION downsampled_width, + * JSAMPARRAY input_data, + * JSAMPARRAY *output_data_ptr); + * + * Note: the use of unaligned writes is the main remaining bottleneck in + * this code, which can be potentially solved to get up to tens + * of percents performance improvement on Cortex-A8/Cortex-A9. + */ + +/* + * Upsample 16 source pixels to 32 destination pixels. The new 16 source + * pixels are loaded to q0. The previous 16 source pixels are in q1. The + * shifted-by-one source pixels are constructed in q2 by using q0 and q1. + * Register d28 is used for multiplication by 3. Register q15 is used + * for adding +1 bias. + */ +.macro upsample16 OUTPTR, INPTR + vld1.8 {q0}, [\INPTR]! + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 /* q2 = byte 15 of q1 followed by bytes 0-14 of q0 */ + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + vmov q1, q0 /* backup source pixels to q1 */ + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! +.endm + +/* + * Upsample 32 source pixels to 64 destination pixels. 
Compared to 'upsample16' + * macro, the roles of q0 and q1 registers are reversed for even and odd + * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed. + * Also this unrolling allows reordering loads and stores to compensate for + * multiplication latency and reduce stalls. + */ +.macro upsample32 OUTPTR, INPTR + /* even 16 pixels group */ + vld1.8 {q0}, [\INPTR]! + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + /* odd 16 pixels group */ + vld1.8 {q1}, [\INPTR]! + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vmovl.u8 q8, d2 + vext.8 q2, q0, q1, #15 + vmovl.u8 q9, d3 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d2, d28 + vmlal.u8 q11, d3, d28 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the even group's results */ + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the odd group's results */ +.endm + +/* + * Upsample a row of WIDTH pixels from INPTR to OUTPTR. 
+ */ +.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 + /* special case for the first and last pixels */ + sub \WIDTH, \WIDTH, #1 + add \OUTPTR, \OUTPTR, #1 + ldrb \TMP1, [\INPTR, \WIDTH] + strb \TMP1, [\OUTPTR, \WIDTH, asl #1] + ldrb \TMP1, [\INPTR], #1 + strb \TMP1, [\OUTPTR, #-1] + vmov.8 d3[7], \TMP1 + + subs \WIDTH, \WIDTH, #32 + blt 5f +0: /* process 32 pixels per iteration */ + upsample32 \OUTPTR, \INPTR + subs \WIDTH, \WIDTH, #32 + bge 0b +5: + adds \WIDTH, \WIDTH, #16 + blt 1f +0: /* process 16 pixels if needed */ + upsample16 \OUTPTR, \INPTR + subs \WIDTH, \WIDTH, #16 +1: + adds \WIDTH, \WIDTH, #16 + beq 9f + + /* load the remaining 1-15 pixels */ + add \INPTR, \INPTR, \WIDTH + tst \WIDTH, #1 + beq 2f + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #2 + beq 2f + vext.8 d0, d0, d0, #6 + sub \INPTR, \INPTR, #1 + vld1.8 {d0[1]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #4 + beq 2f + vrev64.32 d0, d0 + sub \INPTR, \INPTR, #1 + vld1.8 {d0[3]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[2]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[1]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #8 + beq 2f + vmov d1, d0 + sub \INPTR, \INPTR, #8 + vld1.8 {d0}, [\INPTR] +2: /* upsample the remaining pixels */ + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + vrshrn.u16 d10, q8, #2 + vrshrn.u16 d12, q9, #2 + vshrn.u16 d11, q10, #2 + vshrn.u16 d13, q11, #2 + vzip.8 d10, d11 + vzip.8 d12, d13 + /* store the remaining pixels */ + tst \WIDTH, #8 + beq 2f + vst1.8 {d10, d11}, [\OUTPTR]! + vmov q5, q6 +2: + tst \WIDTH, #4 + beq 2f + vst1.8 {d10}, [\OUTPTR]! + vmov d10, d11 +2: + tst \WIDTH, #2 + beq 2f + vst1.8 {d10[0]}, [\OUTPTR]! + vst1.8 {d10[1]}, [\OUTPTR]! + vst1.8 {d10[2]}, [\OUTPTR]! + vst1.8 {d10[3]}, [\OUTPTR]! 
+ vext.8 d10, d10, d10, #4 +2: + tst \WIDTH, #1 + beq 2f + vst1.8 {d10[0]}, [\OUTPTR]! + vst1.8 {d10[1]}, [\OUTPTR]! +2: +9: +.endm + +asm_function jsimd_h2v1_fancy_upsample_neon + + MAX_V_SAMP_FACTOR .req r0 + DOWNSAMPLED_WIDTH .req r1 + INPUT_DATA .req r2 + OUTPUT_DATA_PTR .req r3 + OUTPUT_DATA .req OUTPUT_DATA_PTR + + OUTPTR .req r4 + INPTR .req r5 + WIDTH .req ip + TMP .req lr + + push {r4, r5, r6, lr} + vpush {d8-d15} + + ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] + cmp MAX_V_SAMP_FACTOR, #0 + ble 99f + + /* initialize constants */ + vmov.u8 d28, #3 + vmov.u16 q15, #1 +11: + ldr INPTR, [INPUT_DATA], #4 + ldr OUTPTR, [OUTPUT_DATA], #4 + mov WIDTH, DOWNSAMPLED_WIDTH + upsample_row OUTPTR, INPTR, WIDTH, TMP + subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 + bgt 11b + +99: + vpop {d8-d15} + pop {r4, r5, r6, pc} + + .unreq MAX_V_SAMP_FACTOR + .unreq DOWNSAMPLED_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA_PTR + .unreq OUTPUT_DATA + + .unreq OUTPTR + .unreq INPTR + .unreq WIDTH + .unreq TMP + +.purgem upsample16 +.purgem upsample32 +.purgem upsample_row + + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET*) + * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + +.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + sub \PUT_BITS, \PUT_BITS, #0x8 + lsr \TMP, \PUT_BUFFER, \PUT_BITS + uxtb \TMP, \TMP + strb \TMP, [\BUFFER, #1]! + cmp \TMP, #0xff + /*it eq*/ + strbeq \ZERO, [\BUFFER, #1]! 
+.endm + +.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE + /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ + add \PUT_BITS, \SIZE + /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ + orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE +.endm + +.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + cmp \PUT_BITS, #0x10 + blt 15f + eor \ZERO, \ZERO, \ZERO + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP +15: +.endm + +.balign 16 +jsimd_huff_encode_one_block_neon_consts: + .byte 0x01 + .byte 0x02 + .byte 0x04 + .byte 0x08 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + +asm_function jsimd_huff_encode_one_block_neon + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + add r7, sp, #0x1c + sub r4, sp, #0x40 + bfc r4, #0, #5 + mov sp, r4 /* align sp on 32 bytes */ + vst1.64 {d8, d9, d10, d11}, [r4, :128]! + vst1.64 {d12, d13, d14, d15}, [r4, :128] + sub sp, #0x140 /* reserve 320 bytes */ + str r0, [sp, #0x18] /* working state > sp + 0x18 */ + add r4, sp, #0x20 /* r4 = t1 */ + ldr lr, [r7, #0x8] /* lr = dctbl */ + sub r10, r1, #0x1 /* r10=buffer-- */ + ldrsh r1, [r2] + mov r9, #0x10 + mov r8, #0x1 + adr r5, jsimd_huff_encode_one_block_neon_consts + /* prepare data */ + vld1.8 {d26}, [r5, :64] + veor q8, q8, q8 + veor q9, q9, q9 + vdup.16 q14, r9 + vdup.16 q15, r8 + veor q10, q10, q10 + veor q11, q11, q11 + sub r1, r1, r3 + add r9, r2, #0x22 + add r8, r2, #0x18 + add r3, r2, #0x36 + vmov.16 d0[0], r1 + vld1.16 {d2[0]}, [r9, :16] + vld1.16 {d4[0]}, [r8, :16] + vld1.16 {d6[0]}, [r3, :16] + add r1, r2, #0x2 + add r9, r2, #0x30 + add r8, r2, #0x26 + add r3, r2, #0x28 + vld1.16 {d0[1]}, [r1, :16] + vld1.16 {d2[1]}, [r9, :16] + vld1.16 {d4[1]}, [r8, :16] + vld1.16 {d6[1]}, [r3, :16] + add r1, r2, #0x10 + add r9, r2, #0x40 + add r8, r2, #0x34 + add r3, r2, #0x1a + vld1.16 {d0[2]}, [r1, :16] + vld1.16 {d2[2]}, [r9, :16] + vld1.16 {d4[2]}, [r8, :16] + vld1.16 {d6[2]}, [r3, :16] + add r1, r2, #0x20 + add r9, r2, #0x32 + add r8, r2, 
#0x42 + add r3, r2, #0xc + vld1.16 {d0[3]}, [r1, :16] + vld1.16 {d2[3]}, [r9, :16] + vld1.16 {d4[3]}, [r8, :16] + vld1.16 {d6[3]}, [r3, :16] + add r1, r2, #0x12 + add r9, r2, #0x24 + add r8, r2, #0x50 + add r3, r2, #0xe + vld1.16 {d1[0]}, [r1, :16] + vld1.16 {d3[0]}, [r9, :16] + vld1.16 {d5[0]}, [r8, :16] + vld1.16 {d7[0]}, [r3, :16] + add r1, r2, #0x4 + add r9, r2, #0x16 + add r8, r2, #0x60 + add r3, r2, #0x1c + vld1.16 {d1[1]}, [r1, :16] + vld1.16 {d3[1]}, [r9, :16] + vld1.16 {d5[1]}, [r8, :16] + vld1.16 {d7[1]}, [r3, :16] + add r1, r2, #0x6 + add r9, r2, #0x8 + add r8, r2, #0x52 + add r3, r2, #0x2a + vld1.16 {d1[2]}, [r1, :16] + vld1.16 {d3[2]}, [r9, :16] + vld1.16 {d5[2]}, [r8, :16] + vld1.16 {d7[2]}, [r3, :16] + add r1, r2, #0x14 + add r9, r2, #0xa + add r8, r2, #0x44 + add r3, r2, #0x38 + vld1.16 {d1[3]}, [r1, :16] + vld1.16 {d3[3]}, [r9, :16] + vld1.16 {d5[3]}, [r8, :16] + vld1.16 {d7[3]}, [r3, :16] + vcgt.s16 q8, q8, q0 + vcgt.s16 q9, q9, q1 + vcgt.s16 q10, q10, q2 + vcgt.s16 q11, q11, q3 + vabs.s16 q0, q0 + vabs.s16 q1, q1 + vabs.s16 q2, q2 + vabs.s16 q3, q3 + veor q8, q8, q0 + veor q9, q9, q1 + veor q10, q10, q2 + veor q11, q11, q3 + add r9, r4, #0x20 + add r8, r4, #0x80 + add r3, r4, #0xa0 + vclz.i16 q0, q0 + vclz.i16 q1, q1 + vclz.i16 q2, q2 + vclz.i16 q3, q3 + vsub.i16 q0, q14, q0 + vsub.i16 q1, q14, q1 + vsub.i16 q2, q14, q2 + vsub.i16 q3, q14, q3 + vst1.16 {d0, d1, d2, d3}, [r4, :256] + vst1.16 {d4, d5, d6, d7}, [r9, :256] + vshl.s16 q0, q15, q0 + vshl.s16 q1, q15, q1 + vshl.s16 q2, q15, q2 + vshl.s16 q3, q15, q3 + vsub.i16 q0, q0, q15 + vsub.i16 q1, q1, q15 + vsub.i16 q2, q2, q15 + vsub.i16 q3, q3, q15 + vand q8, q8, q0 + vand q9, q9, q1 + vand q10, q10, q2 + vand q11, q11, q3 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + add r1, r2, #0x46 + add r9, r2, #0x3a + add r8, r2, #0x74 + add r3, r2, #0x6a + vld1.16 {d8[0]}, [r1, :16] + vld1.16 {d10[0]}, [r9, :16] + vld1.16 {d12[0]}, [r8, :16] + vld1.16 {d14[0]}, 
[r3, :16] + veor q8, q8, q8 + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + add r1, r2, #0x54 + add r9, r2, #0x2c + add r8, r2, #0x76 + add r3, r2, #0x78 + vld1.16 {d8[1]}, [r1, :16] + vld1.16 {d10[1]}, [r9, :16] + vld1.16 {d12[1]}, [r8, :16] + vld1.16 {d14[1]}, [r3, :16] + add r1, r2, #0x62 + add r9, r2, #0x1e + add r8, r2, #0x68 + add r3, r2, #0x7a + vld1.16 {d8[2]}, [r1, :16] + vld1.16 {d10[2]}, [r9, :16] + vld1.16 {d12[2]}, [r8, :16] + vld1.16 {d14[2]}, [r3, :16] + add r1, r2, #0x70 + add r9, r2, #0x2e + add r8, r2, #0x5a + add r3, r2, #0x6c + vld1.16 {d8[3]}, [r1, :16] + vld1.16 {d10[3]}, [r9, :16] + vld1.16 {d12[3]}, [r8, :16] + vld1.16 {d14[3]}, [r3, :16] + add r1, r2, #0x72 + add r9, r2, #0x3c + add r8, r2, #0x4c + add r3, r2, #0x5e + vld1.16 {d9[0]}, [r1, :16] + vld1.16 {d11[0]}, [r9, :16] + vld1.16 {d13[0]}, [r8, :16] + vld1.16 {d15[0]}, [r3, :16] + add r1, r2, #0x64 + add r9, r2, #0x4a + add r8, r2, #0x3e + add r3, r2, #0x6e + vld1.16 {d9[1]}, [r1, :16] + vld1.16 {d11[1]}, [r9, :16] + vld1.16 {d13[1]}, [r8, :16] + vld1.16 {d15[1]}, [r3, :16] + add r1, r2, #0x56 + add r9, r2, #0x58 + add r8, r2, #0x4e + add r3, r2, #0x7c + vld1.16 {d9[2]}, [r1, :16] + vld1.16 {d11[2]}, [r9, :16] + vld1.16 {d13[2]}, [r8, :16] + vld1.16 {d15[2]}, [r3, :16] + add r1, r2, #0x48 + add r9, r2, #0x66 + add r8, r2, #0x5c + add r3, r2, #0x7e + vld1.16 {d9[3]}, [r1, :16] + vld1.16 {d11[3]}, [r9, :16] + vld1.16 {d13[3]}, [r8, :16] + vld1.16 {d15[3]}, [r3, :16] + vcgt.s16 q8, q8, q4 + vcgt.s16 q9, q9, q5 + vcgt.s16 q10, q10, q6 + vcgt.s16 q11, q11, q7 + vabs.s16 q4, q4 + vabs.s16 q5, q5 + vabs.s16 q6, q6 + vabs.s16 q7, q7 + veor q8, q8, q4 + veor q9, q9, q5 + veor q10, q10, q6 + veor q11, q11, q7 + add r1, r4, #0x40 + add r9, r4, #0x60 + add r8, r4, #0xc0 + add r3, r4, #0xe0 + vclz.i16 q4, q4 + vclz.i16 q5, q5 + vclz.i16 q6, q6 + vclz.i16 q7, q7 + vsub.i16 q4, q14, q4 + vsub.i16 q5, q14, q5 + vsub.i16 q6, q14, q6 + vsub.i16 q7, q14, q7 + vst1.16 {d8, d9, d10, d11}, [r1, 
:256] + vst1.16 {d12, d13, d14, d15}, [r9, :256] + vshl.s16 q4, q15, q4 + vshl.s16 q5, q15, q5 + vshl.s16 q6, q15, q6 + vshl.s16 q7, q15, q7 + vsub.i16 q4, q4, q15 + vsub.i16 q5, q5, q15 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + vand q8, q8, q4 + vand q9, q9, q5 + vand q10, q10, q6 + vand q11, q11, q7 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + ldr r12, [r7, #0xc] /* r12 = actbl */ + add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ + mov r9, r12 /* r9 = actbl */ + add r6, r4, #0x80 /* r6 = t2 */ + ldr r11, [r0, #0x8] /* r11 = put_buffer */ + ldr r4, [r0, #0xc] /* r4 = put_bits */ + ldrh r2, [r6, #-128] /* r2 = nbits */ + ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<ehufsi */ + ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ + veor q8, q8, q8 + vceq.i16 q0, q0, q8 + vceq.i16 q1, q1, q8 + vceq.i16 q2, q2, q8 + vceq.i16 q3, q3, q8 + vceq.i16 q4, q4, q8 + vceq.i16 q5, q5, q8 + vceq.i16 q6, q6, q8 + vceq.i16 q7, q7, q8 + vmovn.i16 d0, q0 + vmovn.i16 d2, q1 + vmovn.i16 d4, q2 + vmovn.i16 d6, q3 + vmovn.i16 d8, q4 + vmovn.i16 d10, q5 + vmovn.i16 d12, q6 + vmovn.i16 d14, q7 + vand d0, d0, d26 + vand d2, d2, d26 + vand d4, d4, d26 + vand d6, d6, d26 + vand d8, d8, d26 + vand d10, d10, d26 + vand d12, d12, d26 + vand d14, d14, d26 + vpadd.i8 d0, d0, d2 + vpadd.i8 d4, d4, d6 + vpadd.i8 d8, d8, d10 + vpadd.i8 d12, d12, d14 + vpadd.i8 d0, d0, d4 + vpadd.i8 d8, d8, d12 + vpadd.i8 d0, d0, d8 + vmov.32 r1, d0[1] + vmov.32 r8, d0[0] + mvn r1, r1 + mvn r8, r8 + lsrs r1, r1, #0x1 + rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ + rbit r1, r1 /* r1 = index1 */ + rbit r8, r8 /* r8 = index0 */ + ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ + str r1, [sp, #0x14] /* index1 > sp + 0x14 */ + cmp r8, #0x0 + beq 6f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + 
emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r12, sp, #0x20 /* r12 = t1 */ + ldr r8, [sp, #0x14] /* r8 = index1 */ + adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ + cmp r8, #0x0 + beq 6f + clz r2, r8 + sub r12, r12, lr + lsl r8, r8, r2 + add r2, r2, r12, lsr #1 + add lr, lr, r2, lsl #1 + b 7f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 +7: + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r0, sp, #0x20 + add r0, #0xfe + cmp lr, r0 + bhs 1f + ldr r1, [r9] + ldrb r0, [r5] + put_bits r11, r4, r1, r0 + checkbuf15 r10, r11, r4, r0, r1 +1: + ldr r12, [sp, #0x18] + str r11, [r12, #0x8] + str r4, [r12, #0xc] + add r0, r10, #0x1 + add r4, sp, #0x140 + vld1.64 {d8, d9, d10, d11}, [r4, :128]! 
+ vld1.64 {d12, d13, d14, d15}, [r4, :128] + sub r4, r7, #0x1c + mov sp, r4 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf15 diff --git a/Builder/jni-1.11/simd/src/jsimd_i386.c b/Builder/jni-1.11/simd/jsimd_i386.c similarity index 77% rename from Builder/jni-1.11/simd/src/jsimd_i386.c rename to Builder/jni-1.11/simd/jsimd_i386.c index 87ba2a69c..5ab71f181 100644 --- a/Builder/jni-1.11/simd/src/jsimd_i386.c +++ b/Builder/jni-1.11/simd/jsimd_i386.c @@ -2,8 +2,9 @@ * jsimd_i386.c * * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright 2009-2011 D. R. Commander - * + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. * For conditions of distribution and use, see copyright notice in jsimdext.inc @@ -14,15 +15,13 @@ */ #define JPEG_INTERNALS -#include "jinclude.h" -#include "jpeglib.h" -#include "jdct.h" -#include "jsimddct.h" +#include "h/jinclude.h" +#include "h/jpeglib.h" +#include "h/jsimd.h" +#include "h/jdct.h" +#include "h/jsimddct.h" #include "jsimd.h" -#include "StLog.h" -#define LCTX "TurboJPEG.SIMD" - /* * In the PIC cases, we have no guarantee that constants will keep * their alignment. This macro allows us to verify it at runtime. @@ -32,6 +31,7 @@ #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; /* * Check what SIMD accelerations are supported. 
@@ -41,28 +41,32 @@ static unsigned int simd_support = ~0; LOCAL(void) init_simd (void) { + char *env = NULL; + if (simd_support != ~0U) - { return; - } - simd_support = 0; + simd_support = jpeg_simd_cpu_support(); /* Force different settings through environment variables */ - char *env = getenv("JSIMD_FORCESSE2"); - + env = getenv("JSIMD_FORCEMMX"); if ((env != NULL) && (strcmp(env, "1") == 0)) - { - simd_support |= JSIMD_SSE2; - } - - env = getenv("JSIMD_FORCE_NO_SIMD"); + simd_support &= JSIMD_MMX; + env = getenv("JSIMD_FORCE3DNOW"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_3DNOW|JSIMD_MMX; + env = getenv("JSIMD_FORCESSE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE|JSIMD_MMX; + env = getenv("JSIMD_FORCESSE2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE2; + env = getenv("JSIMD_FORCENONE"); if ((env != NULL) && (strcmp(env, "1") == 0)) - { simd_support = 0; - } - - INFO_L(LCTX, "SIMD support: %d", simd_support); + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; } GLOBAL(int) @@ -131,6 +135,12 @@ jsimd_can_ycc_rgb (void) return 0; } +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + GLOBAL(void) jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, @@ -139,8 +149,7 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - switch(cinfo->in_color_space) - { + switch(cinfo->in_color_space) { case JCS_EXT_RGB: sse2fct=jsimd_extrgb_ycc_convert_sse2; mmxfct=jsimd_extrgb_ycc_convert_mmx; @@ -177,11 +186,9 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) - sse2fct(cinfo->image_width, input_buf, - output_buf, output_row, num_rows); + sse2fct(cinfo->image_width, input_buf, 
output_buf, output_row, num_rows); else if (simd_support & JSIMD_MMX) - mmxfct(cinfo->image_width, input_buf, - output_buf, output_row, num_rows); + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); } GLOBAL(void) @@ -192,8 +199,7 @@ jsimd_rgb_gray_convert (j_compress_ptr cinfo, void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - switch(cinfo->in_color_space) - { + switch(cinfo->in_color_space) { case JCS_EXT_RGB: sse2fct=jsimd_extrgb_gray_convert_sse2; mmxfct=jsimd_extrgb_gray_convert_mmx; @@ -230,11 +236,9 @@ jsimd_rgb_gray_convert (j_compress_ptr cinfo, if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) - sse2fct(cinfo->image_width, input_buf, - output_buf, output_row, num_rows); + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); else if (simd_support & JSIMD_MMX) - mmxfct(cinfo->image_width, input_buf, - output_buf, output_row, num_rows); + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); } GLOBAL(void) @@ -245,8 +249,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); - switch(cinfo->out_color_space) - { + switch(cinfo->out_color_space) { case JCS_EXT_RGB: sse2fct=jsimd_ycc_extrgb_convert_sse2; mmxfct=jsimd_ycc_extrgb_convert_mmx; @@ -283,11 +286,16 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) - sse2fct(cinfo->output_width, input_buf, - input_row, output_buf, num_rows); + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); else if (simd_support & JSIMD_MMX) - mmxfct(cinfo->output_width, input_buf, - input_row, output_buf, num_rows); + mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) 
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ } GLOBAL(int) @@ -329,31 +337,33 @@ jsimd_can_h2v1_downsample (void) } GLOBAL(void) -jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { if (simd_support & JSIMD_SSE2) jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); else if (simd_support & JSIMD_MMX) jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(void) -jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { if (simd_support & JSIMD_SSE2) jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); else if (simd_support & JSIMD_MMX) jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(int) @@ -396,30 +406,30 @@ jsimd_can_h2v1_upsample (void) GLOBAL(void) jsimd_h2v2_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { if 
(simd_support & JSIMD_SSE2) - jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, - cinfo->output_width, input_data, output_data_ptr); + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); else if (simd_support & JSIMD_MMX) - jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, - cinfo->output_width, input_data, output_data_ptr); + jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(void) jsimd_h2v1_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { if (simd_support & JSIMD_SSE2) - jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, - cinfo->output_width, input_data, output_data_ptr); + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); else if (simd_support & JSIMD_MMX) - jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, - cinfo->output_width, input_data, output_data_ptr); + jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(int) @@ -464,32 +474,36 @@ jsimd_can_h2v1_fancy_upsample (void) GLOBAL(void) jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, output_data_ptr); + compptr->downsampled_width, input_data, + output_data_ptr); else if (simd_support & JSIMD_MMX) jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, output_data_ptr); + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(void) jsimd_h2v1_fancy_upsample 
(j_decompress_ptr cinfo, - jpeg_component_info * compptr, + jpeg_component_info *compptr, JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) + JSAMPARRAY *output_data_ptr) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, output_data_ptr); + compptr->downsampled_width, input_data, + output_data_ptr); else if (simd_support & JSIMD_MMX) jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, output_data_ptr); + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(int) @@ -541,8 +555,7 @@ jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); - switch(cinfo->out_color_space) - { + switch(cinfo->out_color_space) { case JCS_EXT_RGB: sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2; mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx; @@ -579,11 +592,9 @@ jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) - sse2fct(cinfo->output_width, input_buf, - in_row_group_ctr, output_buf); + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); else if (simd_support & JSIMD_MMX) - mmxfct(cinfo->output_width, input_buf, - in_row_group_ctr, output_buf); + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); } GLOBAL(void) @@ -595,8 +606,7 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); - switch(cinfo->out_color_space) - { + switch(cinfo->out_color_space) { case JCS_EXT_RGB: sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2; mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx; @@ -633,11 +643,9 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr 
cinfo, if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) - sse2fct(cinfo->output_width, input_buf, - in_row_group_ctr, output_buf); + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); else if (simd_support & JSIMD_MMX) - mmxfct(cinfo->output_width, input_buf, - in_row_group_ctr, output_buf); + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); } GLOBAL(int) @@ -690,7 +698,7 @@ jsimd_can_convsamp_float (void) GLOBAL(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM * workspace) + DCTELEM *workspace) { if (simd_support & JSIMD_SSE2) jsimd_convsamp_sse2(sample_data, start_col, workspace); @@ -700,7 +708,7 @@ jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, GLOBAL(void) jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, - FAST_FLOAT * workspace) + FAST_FLOAT *workspace) { if (simd_support & JSIMD_SSE2) jsimd_convsamp_float_sse2(sample_data, start_col, workspace); @@ -768,7 +776,7 @@ jsimd_can_fdct_float (void) } GLOBAL(void) -jsimd_fdct_islow (DCTELEM * data) +jsimd_fdct_islow (DCTELEM *data) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) jsimd_fdct_islow_sse2(data); @@ -777,7 +785,7 @@ jsimd_fdct_islow (DCTELEM * data) } GLOBAL(void) -jsimd_fdct_ifast (DCTELEM * data) +jsimd_fdct_ifast (DCTELEM *data) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) jsimd_fdct_ifast_sse2(data); @@ -786,7 +794,7 @@ jsimd_fdct_ifast (DCTELEM * data) } GLOBAL(void) -jsimd_fdct_float (FAST_FLOAT * data) +jsimd_fdct_float (FAST_FLOAT *data) { if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) jsimd_fdct_float_sse(data); @@ -839,8 +847,8 @@ jsimd_can_quantize_float (void) } GLOBAL(void) -jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, - DCTELEM * workspace) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) { if (simd_support & JSIMD_SSE2) 
jsimd_quantize_sse2(coef_block, divisors, workspace); @@ -849,8 +857,8 @@ jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, } GLOBAL(void) -jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, - FAST_FLOAT * workspace) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) { if (simd_support & JSIMD_SSE2) jsimd_quantize_float_sse2(coef_block, divisors, workspace); @@ -911,23 +919,25 @@ jsimd_can_idct_4x4 (void) } GLOBAL(void) -jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) - jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, + output_col); else if (simd_support & JSIMD_MMX) jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col); } GLOBAL(void) -jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) - jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, + output_col); else if (simd_support & JSIMD_MMX) jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col); } @@ -1013,40 +1023,69 @@ jsimd_can_idct_float (void) } GLOBAL(void) -jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) { if ((simd_support & 
JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) - jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); else if (simd_support & JSIMD_MMX) - jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) -jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) - jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); else if (simd_support & JSIMD_MMX) - jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col); + jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) -jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) { if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) - jsimd_idct_float_sse2(compptr->dct_table, coef_block, - output_buf, output_col); + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) - jsimd_idct_float_sse(compptr->dct_table, coef_block, - output_buf, output_col); + jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf, + output_col); else if (simd_support & JSIMD_3DNOW) - 
jsimd_idct_float_3dnow(compptr->dct_table, coef_block, - output_buf, output_col); + jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; } +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} diff --git a/Builder/jni-1.11/simd/jsimd_mips.c b/Builder/jni-1.11/simd/jsimd_mips.c new file mode 100644 index 000000000..02e90cd9f --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_mips.c @@ -0,0 +1,1140 @@ +/* + * jsimd_mips.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * MIPS architecture. 
+ */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include +#include +#include + +static unsigned int simd_support = ~0; + +#if defined(__linux__) + +LOCAL(int) +parse_proc_cpuinfo(const char* search_string) +{ + const char* file_name = "/proc/cpuinfo"; + char cpuinfo_line[256]; + FILE* f = NULL; + simd_support = 0; + + if ((f = fopen(file_name, "r")) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + simd_support |= JSIMD_MIPS_DSPR2; + return 1; + } + } + fclose(f); + } + /* Did not find string in the proc file, or not Linux ELF. */ + return 0; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) + simd_support |= JSIMD_MIPS_DSPR2; +#elif defined(__linux__) + /* We still have a chance to use MIPS DSPR2 regardless of globally used + * -mdspr2 options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux */ + if (!parse_proc_cpuinfo("MIPS 74K")) + return; +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEDSPR2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_MIPS_DSPR2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +} + +static const int mips_idct_ifast_coefs[4] = { + 0x45404540, // FIX( 1.082392200 / 2) = 17734 = 0x4546 + 0x5A805A80, // FIX( 1.414213562 / 2) = 23170 = 0x5A82 + 0x76407640, // FIX( 1.847759065 / 2) = 30274 = 0x7642 + 0xAC60AC60 // FIX(-2.613125930 / 4) = -21407 = 0xAC61 +}; + +/* The following struct 
is borrowed from jdsample.c */ +typedef void (*upsample1_ptr) (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +typedef struct { + struct jpeg_upsampler pub; + JSAMPARRAY color_buf[MAX_COMPONENTS]; + upsample1_ptr methods[MAX_COMPONENTS]; + int next_row_out; + JDIMENSION rows_to_go; + int rowgroup_height[MAX_COMPONENTS]; + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; +} my_upsampler; + +typedef my_upsampler *my_upsample_ptr; + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_c_can_null_convert (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + 
void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2; + + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row, + num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row, + num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert 
(j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf, + num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) +jsimd_c_null_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf, + output_buf, output_row, num_rows, + cinfo->num_components); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if(DCTSIZE != 8) + return 
0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width, + cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(void) +jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data, + compptr->v_samp_factor, + cinfo->max_v_samp_factor, + cinfo->smoothing_factor, + compptr->width_in_blocks, + cinfo->image_width); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width, + cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_int_upsample (void) +{ + 
init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor, + cinfo->output_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor, + cinfo->output_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + + jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index], + upsample->v_expand[compptr->component_index], + input_data, output_data_ptr, + cinfo->output_width, + cinfo->max_v_samp_factor); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & 
JSIMD_MIPS_DSPR2) + jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, + JSAMPLE *); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_h2v2_extbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2; + break; + } + + mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, 
+ cinfo->sample_range_limit); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, + JSAMPLE *); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_h2v1_extbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2; + break; + } + + mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & 
JSIMD_MIPS_DSPR2) + jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + if ((simd_support & JSIMD_MIPS_DSPR2)) + jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_fdct_islow_mips_dspr2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_fdct_ifast_mips_dspr2(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM 
*workspace) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_quantize_mips_dspr2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12 (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if 
(simd_support & JSIMD_MIPS_DSPR2) + jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int workspace[DCTSIZE*4]; /* buffers data between passes */ + jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf, + output_col, workspace); + } +} + +GLOBAL(void) +jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int workspace[96]; + int output[12] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + (int)(output_buf[8] + output_col), + (int)(output_buf[9] + output_col), + (int)(output_buf[10] + output_col), + (int)(output_buf[11] + output_col), + }; + jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table, + workspace); + jsimd_idct_12x12_pass2_mips_dspr2(workspace, output); + } +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + 
return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int output[8] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + }; + + jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table, + output, IDCT_range_limit(cinfo)); + } +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + JCOEFPTR inptr; + IFAST_MULT_TYPE *quantptr; + DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */ + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; + + jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr, + workspace, mips_idct_ifast_coefs); + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. 
*/ + + jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf, + output_col, mips_idct_ifast_coefs); + } +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} diff --git a/Builder/jni-1.11/simd/jsimd_mips_dspr2.S b/Builder/jni-1.11/simd/jsimd_mips_dspr2.S new file mode 100644 index 000000000..c26dd5c53 --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_mips_dspr2.S @@ -0,0 +1,4486 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * All Rights Reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#include "jsimd_mips_dspr2_asm.h" + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + * 20(sp) - cinfo->num_components + * + * Null conversion for compression + */ + + SAVE_REGS_ON_STACK 8, s0, s1 + + lw t9, 24(sp) // t9 = num_rows + lw s0, 28(sp) // s0 = cinfo->num_components + andi t0, a0, 3 // t0 = cinfo->image_width & 3 + beqz t0, 4f // no residual + nop +0: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +1: + sll t3, t1, 2 + lwx t5, t3(a2) // t5 = outptr = output_buf[ci] + lw t2, 0(a1) // t2 = inptr = *input_buf + sll t4, a3, 2 + lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +2: + lbu t3, 0(t2) + addiu t5, t5, 1 + sb t3, -1(t5) + bne t6, t5, 2b + addu t2, t2, s0 +3: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 3b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 1b + nop + addiu a1, a1, 4 + bgez t9, 0b + addiu a3, a3, 1 + b 7f + nop +4: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +5: + sll t3, t1, 2 + lwx t5, t3(a2) // t5 = outptr = output_buf[ci] + lw t2, 0(a1) // t2 = inptr = *input_buf + sll t4, a3, 2 + lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +6: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 6b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 5b + nop + addiu a1, a1, 4 + bgez t9, 4b + addiu a3, a3, 1 +7: + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop + 
+END(jsimd_c_null_convert_mips_dspr2) + +/*****************************************************************************/ +/* + * jsimd_extrgb_ycc_convert_mips_dspr2 + * jsimd_extbgr_ycc_convert_mips_dspr2 + * jsimd_extrgbx_ycc_convert_mips_dspr2 + * jsimd_extbgrx_ycc_convert_mips_dspr2 + * jsimd_extxbgr_ycc_convert_mips_dspr2 + * jsimd_extxrgb_ycc_convert_mips_dspr2 + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs + +.macro DO_RGB_TO_YCC r, \ + g, \ + b, \ + inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw t7, 48(sp) // t7 = num_rows + li s0, 0x4c8b // FIX(0.29900) + li s1, 0x9646 // FIX(0.58700) + li s2, 0x1d2f // FIX(0.11400) + li s3, 0xffffd4cd // -FIX(0.16874) + li s4, 0xffffab33 // -FIX(0.33126) + li s5, 0x8000 // FIX(0.50000) + li s6, 0xffff94d1 // -FIX(0.41869) + li s7, 0xffffeb2f // -FIX(0.08131) + li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 + +0: + addiu t7, -1 // --num_rows + lw t6, 0(a1) // t6 = input_buf[0] + lw t0, 0(a2) + lw t1, 4(a2) + lw t2, 8(a2) + sll t3, a3, 2 + lwx t0, t3(t0) // t0 = output_buf[0][output_row] + lwx t1, t3(t1) // t1 = output_buf[1][output_row] + lwx t2, t3(t2) // t2 = output_buf[2][output_row] + + addu t9, t2, a0 // t9 = end address + addiu a3, 1 + +1: + DO_RGB_TO_YCC t3, t4, t5, t6 + + mtlo s5, $ac0 + mtlo t8, $ac1 + mtlo t8, $ac2 + maddu $ac0, s2, t5 + maddu $ac1, s5, t5 + maddu $ac2, s5, t3 + maddu $ac0, s0, t3 + maddu $ac1, s3, t3 + maddu $ac2, s6, t4 + maddu $ac0, s1, t4 + maddu $ac1, s4, t4 + maddu $ac2, s7, t5 + extr.w t3, $ac0, 16 + extr.w t4, $ac1, 16 + extr.w t5, $ac2, 16 + sb t3, 0(t0) + sb t4, 0(t1) + sb t5, 0(t2) + 
addiu t0, 1 + addiu t2, 1 + bne t2, t9, 1b + addiu t1, 1 + bgtz t7, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_ycc_convert_mips_dspr2) + +.purgem DO_RGB_TO_YCC + +.endm + +/*------------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 + +/*****************************************************************************/ +/* + * jsimd_ycc_extrgb_convert_mips_dspr2 + * jsimd_ycc_extbgr_convert_mips_dspr2 + * jsimd_ycc_extrgbx_convert_mips_dspr2 + * jsimd_ycc_extbgrx_convert_mips_dspr2 + * jsimd_ycc_extxbgr_convert_mips_dspr2 + * jsimd_ycc_extxrgb_convert_mips_dspr2 + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs + +.macro STORE_YCC_TO_RGB scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r_offs(\outptr) + sb \scratch1, \g_offs(\outptr) + sb \scratch2, \b_offs(\outptr) +.if (\pixel_size == 4) + li t0, 0xFF + sb t0, \a_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - input_row + * a3 - output_buf + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s1, 48(sp) + li t3, 0x8000 + li t4, 0x166e9 // FIX(1.40200) + li t5, 0x1c5a2 // FIX(1.77200) + li t6, 0xffff492e // -FIX(0.71414) + li t7, 0xffffa7e6 // -FIX(0.34414) + repl.ph t8, 128 + +0: + lw s0, 0(a3) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + sll s5, a2, 2 + addiu s1, -1 + lwx s2, s5(t0) + lwx 
s3, s5(t1) + lwx s4, s5(t2) + addu t9, s2, a0 + addiu a2, 1 + +1: + lbu s7, 0(s4) // cr + lbu s6, 0(s3) // cb + lbu s5, 0(s2) // y + addiu s2, 1 + addiu s4, 1 + addiu s7, -128 + addiu s6, -128 + mul t2, t7, s6 + mul t0, t6, s7 // Crgtab[cr] + sll s7, 15 + mulq_rs.w t1, t4, s7 // Crrtab[cr] + sll s6, 15 + addu t2, t3 // Cbgtab[cb] + addu t2, t0 + + mulq_rs.w t0, t5, s6 // Cbbtab[cb] + sra t2, 16 + addu t1, s5 + addu t2, s5 // add y + ins t2, t1, 16, 16 + subu.ph t2, t2, t8 + addu t0, s5 + shll_s.ph t2, t2, 8 + subu t0, 128 + shra.ph t2, t2, 8 + shll_s.w t0, t0, 24 + addu.ph t2, t2, t8 // clip & store + sra t0, t0, 24 + sra t1, t2, 16 + addiu t0, 128 + + STORE_YCC_TO_RGB t1, t2, t0, s0 + + bne s2, t9, 1b + addiu s3, 1 + bgtz s1, 0b + addiu a3, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_ycc_\colorid\()_convert_mips_dspr2) + +.purgem STORE_YCC_TO_RGB + +.endm + +/*------------------------------------------id -- pix R G B A */ +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0 + +/*****************************************************************************/ +/* + * jsimd_extrgb_gray_convert_mips_dspr2 + * jsimd_extbgr_gray_convert_mips_dspr2 + * jsimd_extrgbx_gray_convert_mips_dspr2 + * jsimd_extbgrx_gray_convert_mips_dspr2 + * jsimd_extxbgr_gray_convert_mips_dspr2 + * jsimd_extxrgb_gray_convert_mips_dspr2 + * + * Colorspace conversion RGB -> GRAY + */ + +.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs + +.macro DO_RGB_TO_GRAY r, \ + g, \ + b, \ + inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu 
\inptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + li s0, 0x4c8b // s0 = FIX(0.29900) + li s1, 0x9646 // s1 = FIX(0.58700) + li s2, 0x1d2f // s2 = FIX(0.11400) + li s7, 0x8000 // s7 = FIX(0.50000) + lw s6, 48(sp) + andi t7, a0, 3 + +0: + addiu s6, -1 // s6 = num_rows + lw t0, 0(a1) + lw t1, 0(a2) + sll t3, a3, 2 + lwx t1, t3(t1) + addiu a3, 1 + addu t9, t1, a0 + subu t8, t9, t7 + beq t1, t8, 2f + nop + +1: + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t6, $ac0, 16 + + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + extr.w t2, $ac1, 16 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t5, $ac0, 16 + sb t6, 0(t1) + sb t2, 1(t1) + extr.w t3, $ac1, 16 + addiu t1, 4 + sb t5, -2(t1) + sb t3, -1(t1) + bne t1, t8, 1b + nop + +2: + beqz t7, 4f + nop + +3: + DO_RGB_TO_GRAY t3, t4, t5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + extr.w t6, $ac0, 16 + sb t6, 0(t1) + addiu t1, 1 + bne t1, t9, 3b + nop + +4: + bgtz s6, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_gray_convert_mips_dspr2) + +.purgem DO_RGB_TO_GRAY + +.endm + +/*------------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 +/*****************************************************************************/ +/* + * jsimd_h2v2_merged_upsample_mips_dspr2 + * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 + * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 + * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 + * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 + * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 + * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 + * + * Merged h2v2 upsample routines + */ +.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ + pixel_size, \ + r1_offs, \ + g1_offs, \ + b1_offs, \ + a1_offs, \ + r2_offs, \ + g2_offs, \ + b2_offs, \ + a2_offs + +.macro STORE_H2V2_2_PIXELS scratch0 \ + scratch1 \ + scratch2 \ + scratch3 \ + scratch4 \ + scratch5 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li \scratch0, 0xFF + sb \scratch0, \a1_offs(\outptr) + sb \scratch0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V2_1_PIXEL scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) +/* + * a0 - cinfo->output_width + * a1 - input_buf + * a2 - in_row_group_ctr + * a3 - output_buf + * 16(sp) - cinfo->sample_range_limit + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + lw t9, 56(sp) // cinfo->sample_range_limit + lw v0, 0(a1) + lw v1, 4(a1) + lw t0, 8(a1) + sll t1, a2, 3 + addiu t2, t1, 4 + sll t3, a2, 2 + lw t4, 0(a3) // t4 = output_buf[0] + lwx t1, t1(v0) // t1 = 
input_buf[0][in_row_group_ctr*2] + lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] + lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] + lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] + lw t7, 4(a3) // t7 = output_buf[1] + li s1, 0xe6ea + addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] + addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] + addiu s1, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] + xori s2, s1, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] + srl t3, a0, 1 + blez t3, 2f + addu t0, t5, t3 // t0 = end address + 1: + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t5, t5, 1 + addiu t3, t3, -128 // (cb - 128) + addiu s3, s3, -128 // (cr - 128) + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS + extr_r.w s5, $ac1, 16 + mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS + lbu v0, 0(t1) + addiu t6, t6, 1 + addiu t1, t1, 2 + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, -1(t1) + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 + + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, 1(t2) + addiu t2, t2, 2 + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_2_PIXELS AT, s7, ra, 
t3, s3, v1, t7 + + bne t0, t5, 1b + nop +2: + andi t0, a0, 1 + beqz t0, 4f + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t3, t3, -128 // (cb - 128) + addiu s3, s3, -128 // (cr - 128) + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + lbu v0, 0(t1) + extr_r.w s5, $ac1, 16 + mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS + mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_1_PIXEL t3, s3, v1, t4 + + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_1_PIXEL t3, s3, v1, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) + +.purgem STORE_H2V2_1_PIXEL +.purgem STORE_H2V2_2_PIXELS +.endm + +/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 +/*****************************************************************************/ +/* + * jsimd_h2v1_merged_upsample_mips_dspr2 + * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 + * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 + * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 + * 
jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 + * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 + * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 + * + * Merged h2v1 upsample routines + */ + +.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ + pixel_size, \ + r1_offs, \ + g1_offs, \ + b1_offs, \ + a1_offs, \ + r2_offs, \ + g2_offs, \ + b2_offs, \ + a2_offs + +.macro STORE_H2V1_2_PIXELS scratch0 \ + scratch1 \ + scratch2 \ + scratch3 \ + scratch4 \ + scratch5 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) + sb t0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V1_1_PIXEL scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) +/* + * a0 - cinfo->output_width + * a1 - input_buf + * a2 - in_row_group_ctr + * a3 - output_buf + * 16(sp) - range_limit + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + li t0, 0xe6ea + lw t1, 0(a1) // t1 = input_buf[0] + lw t2, 4(a1) // t2 = input_buf[1] + lw t3, 8(a1) // t3 = input_buf[2] + lw t8, 56(sp) // t8 = range_limit + addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] + addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] + addiu s0, t0, 0x9916 // s0 = 0x8000 + addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] + xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] + srl t0, a0, 1 + sll t4, a2, 2 + lwx s5, t4(t1) // s5 = inptr0 + lwx s6, t4(t2) // s6 = inptr1 + lwx s7, t4(t3) // s7 = inptr2 + lw t7, 0(a3) // t7 = outptr + blez t0, 2f + addu t9, s6, t0 // t9 = end address +1: + lbu t2, 0(s6) // 
t2 = cb + lbu t0, 0(s7) // t0 = cr + lbu t1, 0(s5) // t1 = y + addiu t2, t2, -128 // t2 = cb - 128 + addiu t0, t0, -128 // t0 = cr - 128 + mult $ac1, s4, t2 + madd $ac1, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS + extr_r.w t5, $ac1, 16 + mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS + addiu s7, s7, 1 + addiu s6, s6, 1 + addu t2, t1, t0 // t2 = y + cred + addu t3, t1, t5 // t3 = y + cgreen + addu t4, t1, t6 // t4 = y + cblue + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t1, 1(s5) + lbu v0, 0(t2) + lbu v1, 0(t3) + lbu ra, 0(t4) + addu t2, t1, t0 + addu t3, t1, t5 + addu t4, t1, t6 + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 + + bne t9, s6, 1b + addiu s5, s5, 2 +2: + andi t0, a0, 1 + beqz t0, 4f + nop +3: + lbu t2, 0(s6) + lbu t0, 0(s7) + lbu t1, 0(s5) + addiu t2, t2, -128 //(cb - 128) + addiu t0, t0, -128 //(cr - 128) + mul t3, s4, t2 + mul t4, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS + mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS + addu t3, t3, s0 + addu t3, t4, t3 + sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS + addu t2, t1, t0 // y + cred + addu t3, t1, t5 // y + cgreen + addu t4, t1, t6 // y + cblue + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_1_PIXEL t2, t3, t4, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) + +.purgem STORE_H2V1_1_PIXEL +.purgem STORE_H2V1_2_PIXELS +.endm + +/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 +/*****************************************************************************/ +/* + * jsimd_h2v2_fancy_upsample_mips_dspr2 + * + * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. + */ +LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - downsampled_width + * a2 - input_data + * a3 - output_data_ptr + */ + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + li s4, 0 + lw s2, 0(a3) // s2 = *output_data_ptr +0: + li t9, 2 + lw s1, -4(a2) // s1 = inptr1 + +1: + lw s0, 0(a2) // s0 = inptr0 + lwx s3, s4(s2) + addiu s5, a1, -2 // s5 = downsampled_width - 2 + srl t4, s5, 1 + sll t4, t4, 1 + lbu t0, 0(s0) + lbu t1, 1(s0) + lbu t2, 0(s1) + lbu t3, 1(s1) + addiu s0, 2 + addiu s1, 2 + addu t8, s0, t4 // t8 = end address + andi s5, s5, 1 // s5 = residual + sll t4, t0, 1 + sll t6, t1, 1 + addu t0, t0, t4 // t0 = (*inptr0++) * 3 + addu t1, t1, t6 // t1 = (*inptr0++) * 3 + addu t7, t0, t2 // t7 = thiscolsum + addu t6, t1, t3 // t5 = nextcolsum + sll t0, t7, 2 // t0 = thiscolsum * 4 + subu t1, t0, t7 // t1 = thiscolsum * 3 + shra_r.w t0, t0, 4 + addiu t1, 7 + addu t1, t1, t6 + srl t1, t1, 4 + sb t0, 0(s3) + sb t1, 1(s3) + beq t8, s0, 22f // skip to final iteration if width == 3 + addiu s3, 2 +2: + lh t0, 0(s0) // t0 = A3|A2 + lh t2, 0(s1) // t2 = B3|B2 + addiu s0, 2 + addiu s1, 2 + preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 + preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 + shll.ph t1, t0, 1 + sll t3, t6, 1 + addu.ph t0, t1, t0 // t0 = A3*3|A2*3 + addu t3, t3, t6 // t3 = this * 3 + addu.ph t0, t0, t2 // t0 = next2|next1 + addu t1, t3, t7 + andi t7, t0, 0xFFFF // t7 = next1 + sll t2, t7, 1 + addu t2, t7, t2 // t2 = 
next1*3 + addu t4, t2, t6 + srl t6, t0, 16 // t6 = next2 + shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 + addu t0, t3, t7 + addiu t0, 7 + srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 + shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 + addu t2, t2, t6 + addiu t2, 7 + srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 + sb t1, 0(s3) + sb t0, 1(s3) + sb t4, 2(s3) + sb t2, 3(s3) + bne t8, s0, 2b + addiu s3, 4 +22: + beqz s5, 4f + addu t8, s0, s5 +3: + lbu t0, 0(s0) + lbu t2, 0(s1) + addiu s0, 1 + addiu s1, 1 + sll t3, t6, 1 + sll t1, t0, 1 + addu t1, t0, t1 // t1 = inptr0 * 3 + addu t3, t3, t6 // t3 = thiscolsum * 3 + addu t5, t1, t2 + addu t1, t3, t7 + shra_r.w t1, t1, 4 + addu t0, t3, t5 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu s3, 2 + move t7, t6 + bne t8, s0, 3b + move t6, t5 +4: + sll t0, t6, 2 // t0 = thiscolsum * 4 + subu t1, t0, t6 // t1 = thiscolsum * 3 + addu t1, t1, t7 + addiu s4, 4 + shra_r.w t1, t1, 4 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu t9, -1 + addiu s3, 2 + bnez t9, 1b + lw s1, 4(a2) + srl t0, s4, 2 + subu t0, a0, t0 + bgtz t0, 0b + addiu a2, 4 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop +END(jsimd_h2v2_fancy_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - downsampled_width + * a2 - input_data + * a3 - output_data_ptr + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + .set at + + beqz a0, 3f + sll t0, a0, 2 + lw s1, 0(a3) + li s3, 0x10001 + addu s0, s1, t0 +0: + addiu t8, a1, -2 + srl t9, t8, 2 + lw t7, 0(a2) + lw s2, 0(s1) + lbu t0, 0(t7) + lbu t1, 1(t7) // t1 = inptr[1] + sll t2, t0, 1 + addu t2, t2, t0 // t2 = invalue*3 + addu t2, t2, t1 + shra_r.w t2, t2, 2 + sb t0, 0(s2) + sb t2, 1(s2) + beqz t9, 11f + addiu s2, 2 +1: + ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| + ulw t1, 1(t7) + ulh t2, 4(t7) // t2 = 
|0|0|P5|P4| + preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| + preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| + preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| + preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| + preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| + shll.ph t5, t4, 1 + shll.ph t6, t1, 1 + addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| + addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| + addu.ph t4, t3, s3 + addu.ph t0, t0, s3 + addu.ph t4, t4, t5 + addu.ph t0, t0, t6 + shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| + shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| + addu.ph t2, t2, t5 + addu.ph t3, t3, t6 + shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| + shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| + shll.ph t2, t2, 8 + shll.ph t3, t3, 8 + or t2, t4, t2 + or t3, t3, t0 + addiu t9, -1 + usw t3, 0(s2) + usw t2, 4(s2) + addiu s2, 8 + bgtz t9, 1b + addiu t7, 4 +11: + andi t8, 3 + beqz t8, 22f + addiu t7, 1 + +2: + lbu t0, 0(t7) + addiu t7, 1 + sll t1, t0, 1 + addu t2, t0, t1 // t2 = invalue + lbu t3, -2(t7) + lbu t4, 0(t7) + addiu t3, 1 + addiu t4, 2 + addu t3, t3, t2 + addu t4, t4, t2 + srl t3, 2 + srl t4, 2 + sb t3, 0(s2) + sb t4, 1(s2) + addiu t8, -1 + bgtz t8, 2b + addiu s2, 2 + +22: + lbu t0, 0(t7) + lbu t2, -1(t7) + sll t1, t0, 1 + addu t1, t1, t0 // t1 = invalue * 3 + addu t1, t1, t2 + addiu t1, 1 + srl t1, t1, 2 + sb t1, 0(s2) + sb t0, 1(s2) + addiu s1, 4 + bne s1, s0, 0b + addiu a2, 4 +3: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_h2v1_fancy_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - cinfo->max_v_samp_factor + * a2 - compptr->v_samp_factor + * a3 - compptr->width_in_blocks + * 16(sp) - input_data + * 20(sp) - output_data + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 + + beqz a2, 7f + lw s1, 44(sp) // s1 = output_data + lw s0, 40(sp) // s0 = input_data + srl s2, a0, 2 + andi t9, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 // 
t0 = width_in_blocks*DCT + srl t7, t0, 1 + subu s2, t7, s2 +0: + andi t6, a0, 1 // t6 = temp_index + addiu t6, -1 + lw t4, 0(s1) // t4 = outptr + lw t5, 0(s0) // t5 = inptr0 + li s3, 0 // s3 = bias + srl t7, a0, 1 // t7 = image_width1 + srl s4, t7, 2 + andi t8, t7, 3 +1: + ulhu t0, 0(t5) + ulhu t1, 2(t5) + ulhu t2, 4(t5) + ulhu t3, 6(t5) + raddu.w.qb t0, t0 + raddu.w.qb t1, t1 + raddu.w.qb t2, t2 + raddu.w.qb t3, t3 + shra.ph t0, t0, 1 + shra_r.ph t1, t1, 1 + shra.ph t2, t2, 1 + shra_r.ph t3, t3, 1 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t3, 3(t4) + addiu s4, -1 + addiu t4, 4 + bgtz s4, 1b + addiu t5, 8 + beqz t8, 3f + addu s4, t4, t8 +2: + ulhu t0, 0(t5) + raddu.w.qb t0, t0 + addqh.w t0, t0, s3 + xori s3, s3, 1 + sb t0, 0(t4) + addiu t4, 1 + bne t4, s4, 2b + addiu t5, 2 +3: + lbux t1, t6(t5) + sll t1, 1 + addqh.w t2, t1, s3 // t2 = pixval1 + xori s3, s3, 1 + addqh.w t3, t1, s3 // t3 = pixval2 + blez s2, 5f + append t3, t2, 8 + addu t5, t4, s2 // t5 = loop_end2 +4: + ush t3, 0(t4) + addiu s2, -1 + bgtz s2, 4b + addiu t4, 2 +5: + beqz t9, 6f + nop + sb t2, 0(t4) +6: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 4 +7: + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 + + j ra + nop +END(jsimd_h2v1_downsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2) + +/* + * a0 - cinfo->image_width + * a1 - cinfo->max_v_samp_factor + * a2 - compptr->v_samp_factor + * a3 - compptr->width_in_blocks + * 16(sp) - input_data + * 20(sp) - output_data + */ + .set at + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + beqz a2, 8f + lw s1, 52(sp) // s1 = output_data + lw s0, 48(sp) // s0 = input_data + + andi t6, a0, 1 // t6 = temp_index + addiu t6, -1 + srl t7, a0, 1 // t7 = image_width1 + srl s4, t7, 2 + andi t8, t7, 3 + andi t9, a0, 2 + srl s2, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 // s2 = width_in_blocks*DCT + srl t7, t0, 1 + subu s2, t7, s2 
+0: + lw t4, 0(s1) // t4 = outptr + lw t5, 0(s0) // t5 = inptr0 + lw s7, 4(s0) // s7 = inptr1 + li s6, 1 // s6 = bias +2: + ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| + ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| + ulw t2, 4(t5) + ulw t3, 4(s7) + precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| + ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| + raddu.w.qb t1, t7 + raddu.w.qb t0, t0 + shra_r.w t1, t1, 2 + addiu t0, 1 + srl t0, 2 + precrq.ph.w t7, t2, t3 + ins t2, t3, 16, 16 + raddu.w.qb t7, t7 + raddu.w.qb t2, t2 + shra_r.w t7, t7, 2 + addiu t2, 1 + srl t2, 2 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t7, 3(t4) + addiu t4, 4 + addiu t5, 8 + addiu s4, s4, -1 + bgtz s4, 2b + addiu s7, 8 + beqz t8, 4f + addu t8, t4, t8 +3: + ulhu t0, 0(t5) + ulhu t1, 0(s7) + ins t0, t1, 16, 16 + raddu.w.qb t0, t0 + addu t0, t0, s6 + srl t0, 2 + xori s6, s6, 3 + sb t0, 0(t4) + addiu t5, 2 + addiu t4, 1 + bne t8, t4, 3b + addiu s7, 2 +4: + lbux t1, t6(t5) + sll t1, 1 + lbux t0, t6(s7) + sll t0, 1 + addu t1, t1, t0 + addu t3, t1, s6 + srl t0, t3, 2 // t2 = pixval1 + xori s6, s6, 3 + addu t2, t1, s6 + srl t1, t2, 2 // t3 = pixval2 + blez s2, 6f + append t1, t0, 8 +5: + ush t1, 0(t4) + addiu s2, -1 + bgtz s2, 5b + addiu t4, 2 +6: + beqz t9, 7f + nop + sb t0, 0(t4) +7: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 8 +8: + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_h2v2_downsample_mips_dspr2) +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2) +/* + * a0 - input_data + * a1 - output_data + * a2 - compptr->v_samp_factor + * a3 - cinfo->max_v_samp_factor + * 16(sp) - cinfo->smoothing_factor + * 20(sp) - compptr->width_in_blocks + * 24(sp) - cinfo->image_width + */ + + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s7, 52(sp) // compptr->width_in_blocks + lw s0, 56(sp) // cinfo->image_width + lw s6, 48(sp) // cinfo->smoothing_factor + sll s7, 3 // 
output_cols = width_in_blocks * DCTSIZE + sll v0, s7, 1 + subu v0, v0, s0 + blez v0, 2f + move v1, zero + addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 +0: + addiu t1, a0, -4 + sll t2, v1, 2 + lwx t1, t2(t1) + move t3, v0 + addu t1, t1, s0 + lbu t2, -1(t1) +1: + addiu t3, t3, -1 + sb t2, 0(t1) + bgtz t3, 1b + addiu t1, t1, 1 + addiu v1, v1, 1 + bne v1, t0, 0b + nop +2: + li v0, 80 + mul v0, s6, v0 + li v1, 16384 + move t4, zero + move t5, zero + subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 + sll t7, s6, 4 // t7 = tmp_smoot_f * 16 +3: +/* Special case for first column: pretend column -1 is same as column 0 */ + sll v0, t4, 2 + lwx t8, v0(a1) // outptr = output_data[outrow] + sll v1, t5, 2 + addiu t9, v1, 4 + addiu s0, v1, -4 + addiu s1, v1, 8 + lwx s2, v1(a0) // inptr0 = input_data[inrow] + lwx t9, t9(a0) // inptr1 = input_data[inrow+1] + lwx s0, s0(a0) // above_ptr = input_data[inrow-1] + lwx s1, s1(a0) // below_ptr = input_data[inrow+2] + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, 0(s2) + lbu v1, 2(s2) + lbu t0, 0(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1,t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, 0(s0) + lbu t0, 0(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1,s3, t7 + extr_r.w v0, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + addiu s1, s1, 2 + sb v0, -1(t8) + addiu s4, s7, -2 + and s4, s4, 3 + addu s5, s4, t8 //end adress +4: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + 
sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + sb t2, -1(t8) + bne s5, t8, 4b + addiu s1, s1, 2 + addiu s5, s7, -2 + subu s5, s5, s4 + addu s5, s5, t8 //end adress +5: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + lh v1, 2(t9) + addu t0, t0, v0 + lh v0, 2(s2) + addu s3, t0, s3 + lh t0, 2(s0) + lh t1, 2(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 4(s2) + lbu t0, 1(t9) + lbu t1, 4(t9) + sb t2, 0(t8) + raddu.w.qb t3, v0 + lbu v0, 1(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 4(s0) + addu t0, t0, v0 + lbu v0, 1(s0) + addu s3, t0, s3 + lbu t0, 1(s1) + lbu t3, 4(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 4(t9) + addu t0, t0, v0 + lh v0, 4(s2) + addu s3, t0, s3 + lh t0, 4(s0) + lh t1, 4(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 6(s2) + lbu t0, 3(t9) + lbu t1, 6(t9) + sb t2, 1(t8) + raddu.w.qb t3, v0 + lbu v0, 3(s2) + addu t0, t0,t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 6(s0) + addu t0, t0, v0 + lbu v0, 3(s0) + addu s3, t0, s3 + lbu t0, 3(s1) + lbu t3, 6(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 6(t9) + addu t0, t0, v0 + lh v0, 6(s2) + addu s3, t0, s3 + lh t0, 6(s0) + lh t1, 6(s1) + madd $ac1, s3, t7 + extr_r.w t3, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 8(s2) + lbu t0, 5(t9) + lbu 
t1, 8(t9) + sb t3, 2(t8) + raddu.w.qb t2, v0 + lbu v0, 5(s2) + addu t0, t0, t1 + mult $ac1, t2, t6 + addu v0, v0, v1 + lbu t2, 8(s0) + addu t0, t0, v0 + lbu v0, 5(s0) + addu s3, t0, s3 + lbu t0, 5(s1) + lbu t3, 8(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + addiu t8, t8, 4 + addu t0, t0, v0 + addiu s2, s2, 8 + addu s3, t0, s3 + addiu t9, t9, 8 + madd $ac1, s3, t7 + extr_r.w t1, $ac1, 16 + addiu s0, s0, 8 + addiu s1, s1, 8 + bne s5, t8, 5b + sb t1, -1(t8) +/* Special case for last column */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 1(s2) + lbu t0, -1(t9) + lbu t1, 1(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 1(s0) + addu t0, t0, v0 + lbu t3, 1(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t0, $ac1, 16 + addiu t5, t5, 2 + sb t0, 0(t8) + addiu t4, t4, 1 + bne t4, a2, 3b + addiu t5, t5, 2 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_h2v2_smooth_downsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2) +/* + * a0 - upsample->h_expand[compptr->component_index] + * a1 - upsample->v_expand[compptr->component_index] + * a2 - input_data + * a3 - output_data_ptr + * 16(sp) - cinfo->output_width + * 20(sp) - cinfo->max_v_samp_factor + */ + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + lw s0, 0(a3) // s0 = output_data + lw s1, 32(sp) // s1 = cinfo->output_width + lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor + li t6, 0 // t6 = inrow + beqz s2, 10f + li s3, 0 // s3 = outrow +0: + addu t0, a2, t6 + addu t7, s0, s3 + lw t3, 0(t0) // t3 = inptr + lw t8, 0(t7) // t8 = outptr + beqz s1, 4f + addu t5, t8, s1 // t5 = outend +1: + lb t2, 0(t3) // 
t2 = invalue = *inptr++ + addiu t3, 1 + beqz a0, 3f + move t0, a0 // t0 = h_expand +2: + sb t2, 0(t8) + addiu t0, -1 + bgtz t0, 2b + addiu t8, 1 +3: + bgt t5, t8, 1b + nop +4: + addiu t9, a1, -1 // t9 = v_expand - 1 + blez t9, 9f + nop +5: + lw t3, 0(s0) + lw t4, 4(s0) + subu t0, s1, 0xF + blez t0, 7f + addu t5, t3, s1 // t5 = end address + andi t7, s1, 0xF // t7 = residual + subu t8, t5, t7 +6: + ulw t0, 0(t3) + ulw t1, 4(t3) + ulw t2, 8(t3) + usw t0, 0(t4) + ulw t0, 12(t3) + usw t1, 4(t4) + usw t2, 8(t4) + usw t0, 12(t4) + addiu t3, 16 + bne t3, t8, 6b + addiu t4, 16 + beqz t7, 8f + nop +7: + lbu t0, 0(t3) + sb t0, 0(t4) + addiu t3, 1 + bne t3, t5, 7b + addiu t4, 1 +8: + addiu t9, -1 + bgtz t9, 5b + addiu s0, 8 +9: + addu s3, s3, a1 + bne s3, s2, 0b + addiu t6, 1 +10: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_int_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - cinfo->output_width + * a2 - input_data + * a3 - output_data_ptr + */ + lw t7, 0(a3) // t7 = output_data + andi t8, a1, 0xf // t8 = residual + sll t0, a0, 2 + blez a0, 4f + addu t9, t7, t0 // t9 = output_data end address +0: + lw t5, 0(t7) // t5 = outptr + lw t6, 0(a2) // t6 = inptr + addu t3, t5, a1 // t3 = outptr + output_width (end address) + subu t3, t8 // t3 = end address - residual + beq t5, t3, 2f + move t4, t8 +1: + ulw t0, 0(t6) // t0 = |P3|P2|P1|P0| + ulw t2, 4(t6) // t2 = |P7|P6|P5|P4| + srl t1, t0, 16 // t1 = |X|X|P3|P2| + ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0| + ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2| + ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0| + ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2| + usw t0, 0(t5) + usw t1, 4(t5) + srl t0, t2, 16 // t0 = |X|X|P7|P6| + ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4| + ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6| + ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4| + ins t0, t0, 8, 16 // t0 = 
|P7|P7|P6|P6| + usw t2, 8(t5) + usw t0, 12(t5) + addiu t5, 16 + bne t5, t3, 1b + addiu t6, 8 + beqz t8, 3f + move t4, t8 +2: + lbu t1, 0(t6) + sb t1, 0(t5) + sb t1, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + addiu t7, 4 + bne t9, t7, 0b + addiu a2, 4 +4: + j ra + nop +END(jsimd_h2v1_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - cinfo->output_width + * a2 - input_data + * a3 - output_data_ptr + */ + lw t7, 0(a3) + blez a0, 7f + andi t9, a1, 0xf // t9 = residual +0: + lw t6, 0(a2) // t6 = inptr + lw t5, 0(t7) // t5 = outptr + addu t8, t5, a1 // t8 = outptr end address + subu t8, t9 // t8 = end address - residual + beq t5, t8, 2f + move t4, t9 +1: + ulw t0, 0(t6) + srl t1, t0, 16 + ins t0, t0, 16, 16 + ins t0, t0, 8, 16 + ins t1, t1, 16, 16 + ins t1, t1, 8, 16 + ulw t2, 4(t6) + usw t0, 0(t5) + usw t1, 4(t5) + srl t3, t2, 16 + ins t2, t2, 16, 16 + ins t2, t2, 8, 16 + ins t3, t3, 16, 16 + ins t3, t3, 8, 16 + usw t2, 8(t5) + usw t3, 12(t5) + addiu t5, 16 + bne t5, t8, 1b + addiu t6, 8 + beqz t9, 3f + move t4, t9 +2: + lbu t0, 0(t6) + sb t0, 0(t5) + sb t0, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + lw t6, 0(t7) // t6 = outptr[0] + lw t5, 4(t7) // t5 = outptr[1] + addu t4, t6, a1 // t4 = new end address + beq a1, t9, 5f + subu t8, t4, t9 +4: + ulw t0, 0(t6) + ulw t1, 4(t6) + ulw t2, 8(t6) + usw t0, 0(t5) + ulw t0, 12(t6) + usw t1, 4(t5) + usw t2, 8(t5) + usw t0, 12(t5) + addiu t6, 16 + bne t6, t8, 4b + addiu t5, 16 + beqz t9, 6f + nop +5: + lbu t0, 0(t6) + sb t0, 0(t5) + addiu t6, 1 + bne t6, t4, 5b + addiu t5, 1 +6: + addiu t7, 8 + addiu a0, -2 + bgtz a0, 0b + addiu a2, 4 +7: + j ra + nop +END(jsimd_h2v2_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2) +/* + * a0 - 
coef_block + * a1 - compptr->dcttable + * a2 - output + * a3 - range_limit + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -256 + move v0, sp + addiu v1, zero, 8 // v1 = DCTSIZE = 8 +1: + lh s4, 32(a0) // s4 = inptr[16] + lh s5, 64(a0) // s5 = inptr[32] + lh s6, 96(a0) // s6 = inptr[48] + lh t1, 112(a0) // t1 = inptr[56] + lh t7, 16(a0) // t7 = inptr[8] + lh t5, 80(a0) // t5 = inptr[40] + lh t3, 48(a0) // t3 = inptr[24] + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, t5 + or s4, s4, t7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 2f + addiu v1, v1, -1 + lh s5, 0(a1) // quantptr[DCTSIZE*0] + lh s6, 0(a0) // inptr[DCTSIZE*0] + mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) + sll s5, s5, 2 + sw s5, 0(v0) + sw s5, 32(v0) + sw s5, 64(v0) + sw s5, 96(v0) + sw s5, 128(v0) + sw s5, 160(v0) + sw s5, 192(v0) + b 3f + sw s5, 224(v0) +2: + lh t0, 112(a1) + lh t2, 48(a1) + lh t4, 80(a1) + lh t6, 16(a1) + mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7]) + mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3]) + mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5]) + mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1]) + lh t4, 32(a1) + lh t5, 32(a0) + lh t6, 96(a1) + lh t7, 96(a0) + addu s0, t0, t1 // z3 = tmp0 + tmp2 + addu s1, t1, t2 // z2 = tmp1 + tmp2 + addu s2, t2, t3 // z4 = tmp1 + tmp3 + addu s3, s0, s2 // z3 + z4 + addiu t9, zero, 9633 // FIX_1_175875602 + mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) + addu t8, t0, t3 // z1 = tmp0 + tmp3 + addiu t9, zero, 2446 // FIX_0_298631336 + mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) + addiu t9, zero, 16819 // FIX_2_053119869 + mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) + addiu t9, zero, 25172 // FIX_3_072711026 + mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) + addiu t9, zero, 12299 // FIX_1_501321110 + mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) + addiu t9, zero, 16069 // FIX_1_961570560 + 
mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) + addiu t9, zero, 3196 // FIX_0_390180644 + mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) + addiu t9, zero, 7373 // FIX_0_899976223 + mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) + addiu t9, zero, 20995 // FIX_2_562915447 + mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) + subu s0, s3, s0 // z3 += z5 + addu t0, t0, s0 // tmp0 += z3 + addu t1, t1, s0 // tmp2 += z3 + subu s2, s3, s2 // z4 += z5 + addu t2, t2, s2 // tmp1 += z4 + addu t3, t3, s2 // tmp3 += z4 + subu t0, t0, t8 // tmp0 += z1 + subu t1, t1, s1 // tmp2 += z2 + subu t2, t2, s1 // tmp1 += z2 + subu t3, t3, t8 // tmp3 += z1 + mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2]) + addiu t9, zero, 6270 // FIX_0_765366865 + mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6]) + lh t4, 0(a1) + lh t5, 0(a0) + lh t6, 64(a1) + lh t7, 64(a0) + mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) + mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0]) + mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4]) + addiu t9, zero, 4433 // FIX_0_541196100 + addu s3, s0, s1 // z2 + z3 + mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) + addiu t9, zero, 15137 // FIX_1_847759065 + mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) + addu t4, t5, t6 + subu t5, t5, t6 + sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS + sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS + addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) + subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) + addu s0, t4, t7 + subu s1, t4, t7 + addu s2, t5, t6 + subu s3, t5, t6 + addu t4, s0, t3 + subu s0, s0, t3 + addu t3, s2, t1 + subu s2, s2, t1 + addu t1, s3, t2 + subu s3, s3, t2 + addu t2, s1, t0 + subu s1, s1, t0 + shra_r.w t4, t4, 11 + shra_r.w t3, t3, 11 + shra_r.w t1, t1, 11 + shra_r.w t2, t2, 11 + shra_r.w s1, s1, 11 + shra_r.w s3, s3, 11 + shra_r.w s2, s2, 11 + shra_r.w s0, s0, 11 + sw t4, 0(v0) + sw t3, 32(v0) + 
sw t1, 64(v0) + sw t2, 96(v0) + sw s1, 128(v0) + sw s3, 160(v0) + sw s2, 192(v0) + sw s0, 224(v0) +3: + addiu a1, a1, 2 + addiu a0, a0, 2 + bgtz v1, 1b + addiu v0, v0, 4 + move v0, sp + addiu v1, zero, 8 +4: + lw t0, 8(v0) // z2 = (JLONG) wsptr[2] + lw t1, 24(v0) // z3 = (JLONG) wsptr[6] + lw t2, 0(v0) // (JLONG) wsptr[0] + lw t3, 16(v0) // (JLONG) wsptr[4] + lw s4, 4(v0) // (JLONG) wsptr[1] + lw s5, 12(v0) // (JLONG) wsptr[3] + lw s6, 20(v0) // (JLONG) wsptr[5] + lw s7, 28(v0) // (JLONG) wsptr[7] + or s4, s4, t0 + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, s7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 5f + addiu v1, v1, -1 + shra_r.w s5, t2, 5 + andi s5, s5, 0x3ff + lbux s5, s5(a3) + lw s1, 0(a2) + replv.qb s5, s5 + usw s5, 0(s1) + usw s5, 4(s1) + b 6f + nop +5: + addu t4, t0, t1 // z2 + z3 + addiu t8, zero, 4433 // FIX_0_541196100 + mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) + addiu t8, zero, 15137 // FIX_1_847759065 + mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) + addiu t8, zero, 6270 // FIX_0_765366865 + mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) + addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4] + subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4] + sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS + sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS + subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) + subu t3, t2, t1 // tmp12 = tmp1 - tmp2 + addu t2, t2, t1 // tmp11 = tmp1 + tmp2 + addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) + subu t1, t4, t5 // tmp13 = tmp0 - tmp3 + addu t0, t4, t5 // tmp10 = tmp0 + tmp3 + lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7] + lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3] + lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5] + lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1] + addu s0, t4, t6 // z3 = tmp0 + tmp2 + addiu t8, zero, 9633 // FIX_1_175875602 + addu s1, t5, t7 // z4 = tmp1 + tmp3 + addu s2, s0, s1 // z3 + z4 + mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, 
FIX_1_175875602) + addu s3, t4, t7 // z1 = tmp0 + tmp3 + addu t9, t5, t6 // z2 = tmp1 + tmp2 + addiu t8, zero, 16069 // FIX_1_961570560 + mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) + addiu t8, zero, 3196 // FIX_0_390180644 + mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) + addiu t8, zero, 2446 // FIX_0_298631336 + mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) + addiu t8, zero, 7373 // FIX_0_899976223 + mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) + addiu t8, zero, 16819 // FIX_2_053119869 + mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) + addiu t8, zero, 20995 // FIX_2_562915447 + mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) + addiu t8, zero, 25172 // FIX_3_072711026 + mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) + addiu t8, zero, 12299 // FIX_1_501321110 + mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) + subu s0, s2, s0 // z3 += z5 + subu s1, s2, s1 // z4 += z5 + addu t4, t4, s0 + subu t4, t4, s3 // tmp0 + addu t5, t5, s1 + subu t5, t5, t9 // tmp1 + addu t6, t6, s0 + subu t6, t6, t9 // tmp2 + addu t7, t7, s1 + subu t7, t7, s3 // tmp3 + addu s0, t0, t7 + subu t0, t0, t7 + addu t7, t2, t6 + subu t2, t2, t6 + addu t6, t3, t5 + subu t3, t3, t5 + addu t5, t1, t4 + subu t1, t1, t4 + shra_r.w s0, s0, 18 + shra_r.w t7, t7, 18 + shra_r.w t6, t6, 18 + shra_r.w t5, t5, 18 + shra_r.w t1, t1, 18 + shra_r.w t3, t3, 18 + shra_r.w t2, t2, 18 + shra_r.w t0, t0, 18 + andi s0, s0, 0x3ff + andi t7, t7, 0x3ff + andi t6, t6, 0x3ff + andi t5, t5, 0x3ff + andi t1, t1, 0x3ff + andi t3, t3, 0x3ff + andi t2, t2, 0x3ff + andi t0, t0, 0x3ff + lw s1, 0(a2) + lbux s0, s0(a3) + lbux t7, t7(a3) + lbux t6, t6(a3) + lbux t5, t5(a3) + lbux t1, t1(a3) + lbux t3, t3(a3) + lbux t2, t2(a3) + lbux t0, t0(a3) + sb s0, 0(s1) + sb t7, 1(s1) + sb t6, 2(s1) + sb t5, 3(s1) + sb t1, 4(s1) + sb t3, 5(s1) + sb t2, 6(s1) + sb t0, 7(s1) +6: + addiu v0, v0, 32 + bgtz v1, 4b + addiu a2, a2, 4 + addiu sp, sp, 256 + + RESTORE_REGS_FROM_STACK 
32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_islow_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2) +/* + * a0 - inptr + * a1 - quantptr + * a2 - wsptr + * a3 - mips_idct_ifast_coefs + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu t9, a0, 16 // end address + or AT, a3, zero + +0: + lw s0, 0(a1) // quantptr[DCTSIZE*0] + lw t0, 0(a0) // inptr[DCTSIZE*0] + lw t1, 16(a0) // inptr[DCTSIZE*1] + muleq_s.w.phl v0, t0, s0 // tmp0 ... + lw t2, 32(a0) // inptr[DCTSIZE*2] + lw t3, 48(a0) // inptr[DCTSIZE*3] + lw t4, 64(a0) // inptr[DCTSIZE*4] + lw t5, 80(a0) // inptr[DCTSIZE*5] + muleq_s.w.phr t0, t0, s0 // ... tmp0 ... + lw t6, 96(a0) // inptr[DCTSIZE*6] + lw t7, 112(a0) // inptr[DCTSIZE*7] + or s4, t1, t2 + or s5, t3, t4 + bnez s4, 1f + ins t0, v0, 16, 16 // ... tmp0 + bnez s5, 1f + or s6, t5, t6 + or s6, s6, t7 + bnez s6, 1f + sw t0, 0(a2) // wsptr[DCTSIZE*0] + sw t0, 16(a2) // wsptr[DCTSIZE*1] + sw t0, 32(a2) // wsptr[DCTSIZE*2] + sw t0, 48(a2) // wsptr[DCTSIZE*3] + sw t0, 64(a2) // wsptr[DCTSIZE*4] + sw t0, 80(a2) // wsptr[DCTSIZE*5] + sw t0, 96(a2) // wsptr[DCTSIZE*6] + sw t0, 112(a2) // wsptr[DCTSIZE*7] + addiu a0, a0, 4 + b 2f + addiu a1, a1, 4 + +1: + lw s1, 32(a1) // quantptr[DCTSIZE*2] + lw s2, 64(a1) // quantptr[DCTSIZE*4] + muleq_s.w.phl v0, t2, s1 // tmp1 ... + muleq_s.w.phr t2, t2, s1 // ... tmp1 ... + lw s0, 16(a1) // quantptr[DCTSIZE*1] + lw s1, 48(a1) // quantptr[DCTSIZE*3] + lw s3, 96(a1) // quantptr[DCTSIZE*6] + muleq_s.w.phl v1, t4, s2 // tmp2 ... + muleq_s.w.phr t4, t4, s2 // ... tmp2 ... + lw s2, 80(a1) // quantptr[DCTSIZE*5] + lw t8, 4(AT) // FIX(1.414213562) + ins t2, v0, 16, 16 // ... tmp1 + muleq_s.w.phl v0, t6, s3 // tmp3 ... + muleq_s.w.phr t6, t6, s3 // ... tmp3 ... + ins t4, v1, 16, 16 // ... tmp2 + addq.ph s4, t0, t4 // tmp10 + subq.ph s5, t0, t4 // tmp11 + ins t6, v0, 16, 16 // ... 
tmp3 + subq.ph s6, t2, t6 // tmp12 ... + addq.ph s7, t2, t6 // tmp13 + mulq_s.ph s6, s6, t8 // ... tmp12 ... + addq.ph t0, s4, s7 // tmp0 + subq.ph t6, s4, s7 // tmp3 + muleq_s.w.phl v0, t1, s0 // tmp4 ... + muleq_s.w.phr t1, t1, s0 // ... tmp4 ... + shll_s.ph s6, s6, 1 // x2 + lw s3, 112(a1) // quantptr[DCTSIZE*7] + subq.ph s6, s6, s7 // ... tmp12 + muleq_s.w.phl v1, t7, s3 // tmp7 ... + muleq_s.w.phr t7, t7, s3 // ... tmp7 ... + ins t1, v0, 16, 16 // ... tmp4 + addq.ph t2, s5, s6 // tmp1 + subq.ph t4, s5, s6 // tmp2 + muleq_s.w.phl v0, t5, s2 // tmp6 ... + muleq_s.w.phr t5, t5, s2 // ... tmp6 ... + ins t7, v1, 16, 16 // ... tmp7 + addq.ph s5, t1, t7 // z11 + subq.ph s6, t1, t7 // z12 + muleq_s.w.phl v1, t3, s1 // tmp5 ... + muleq_s.w.phr t3, t3, s1 // ... tmp5 ... + ins t5, v0, 16, 16 // ... tmp6 + ins t3, v1, 16, 16 // ... tmp5 + addq.ph s7, t5, t3 // z13 + subq.ph v0, t5, t3 // z10 + addq.ph t7, s5, s7 // tmp7 + subq.ph s5, s5, s7 // tmp11 ... + addq.ph v1, v0, s6 // z5 ... + mulq_s.ph s5, s5, t8 // ... tmp11 + lw t8, 8(AT) // FIX(1.847759065) + lw s4, 0(AT) // FIX(1.082392200) + addq.ph s0, t0, t7 + subq.ph s1, t0, t7 + mulq_s.ph v1, v1, t8 // ... z5 + shll_s.ph s5, s5, 1 // x2 + lw t8, 12(AT) // FIX(-2.613125930) + sw s0, 0(a2) // wsptr[DCTSIZE*0] + shll_s.ph v0, v0, 1 // x4 + mulq_s.ph v0, v0, t8 // tmp12 ... + mulq_s.ph s4, s6, s4 // tmp10 ... + shll_s.ph v1, v1, 1 // x2 + addiu a0, a0, 4 + addiu a1, a1, 4 + sw s1, 112(a2) // wsptr[DCTSIZE*7] + shll_s.ph s6, v0, 1 // x4 + shll_s.ph s4, s4, 1 // x2 + addq.ph s6, s6, v1 // ... tmp12 + subq.ph t5, s6, t7 // tmp6 + subq.ph s4, s4, v1 // ... 
tmp10 + subq.ph t3, s5, t5 // tmp5 + addq.ph s2, t2, t5 + addq.ph t1, s4, t3 // tmp4 + subq.ph s3, t2, t5 + sw s2, 16(a2) // wsptr[DCTSIZE*1] + sw s3, 96(a2) // wsptr[DCTSIZE*6] + addq.ph v0, t4, t3 + subq.ph v1, t4, t3 + sw v0, 32(a2) // wsptr[DCTSIZE*2] + sw v1, 80(a2) // wsptr[DCTSIZE*5] + addq.ph v0, t6, t1 + subq.ph v1, t6, t1 + sw v0, 64(a2) // wsptr[DCTSIZE*4] + sw v1, 48(a2) // wsptr[DCTSIZE*3] + +2: + bne a0, t9, 0b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_ifast_cols_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2) +/* + * a0 - wsptr + * a1 - output_buf + * a2 - output_col + * a3 - mips_idct_ifast_coefs + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + addiu t9, a0, 128 // end address + lui s8, 0x8080 + ori s8, s8, 0x8080 + +0: + lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) + lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a + lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A + lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c + lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C + lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e + lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E + lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g + lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G + precrq.ph.w t1, s0, t0 // B b + ins t0, s0, 16, 16 // A a + bnez t1, 1f + or s0, t2, s2 + bnez s0, 1f + or s0, t4, s4 + bnez s0, 1f + or s0, t6, s6 + bnez s0, 1f + shll_s.ph s0, t0, 2 // A a + lw a3, 0(a1) + lw AT, 4(a1) + precrq.ph.w t0, s0, s0 // A A + ins s0, s0, 16, 16 // a a + addu a3, a3, a2 + addu AT, AT, a2 + precrq.qb.ph t0, t0, t0 // A A A A + precrq.qb.ph s0, s0, s0 // a a a a + addu.qb s0, s0, s8 + addu.qb t0, t0, s8 + sw s0, 0(a3) + sw s0, 4(a3) + sw t0, 0(AT) + sw t0, 4(AT) + addiu a0, a0, 32 + bne a0, t9, 0b + addiu a1, a1, 8 + b 2f + nop + +1: + precrq.ph.w t3, s2, t2 + ins t2, s2, 16, 16 + precrq.ph.w t5, s4, t4 + ins t4, s4, 
16, 16 + precrq.ph.w t7, s6, t6 + ins t6, s6, 16, 16 + lw t8, 4(AT) // FIX(1.414213562) + addq.ph s4, t0, t4 // tmp10 + subq.ph s5, t0, t4 // tmp11 + subq.ph s6, t2, t6 // tmp12 ... + addq.ph s7, t2, t6 // tmp13 + mulq_s.ph s6, s6, t8 // ... tmp12 ... + addq.ph t0, s4, s7 // tmp0 + subq.ph t6, s4, s7 // tmp3 + shll_s.ph s6, s6, 1 // x2 + subq.ph s6, s6, s7 // ... tmp12 + addq.ph t2, s5, s6 // tmp1 + subq.ph t4, s5, s6 // tmp2 + addq.ph s5, t1, t7 // z11 + subq.ph s6, t1, t7 // z12 + addq.ph s7, t5, t3 // z13 + subq.ph v0, t5, t3 // z10 + addq.ph t7, s5, s7 // tmp7 + subq.ph s5, s5, s7 // tmp11 ... + addq.ph v1, v0, s6 // z5 ... + mulq_s.ph s5, s5, t8 // ... tmp11 + lw t8, 8(AT) // FIX(1.847759065) + lw s4, 0(AT) // FIX(1.082392200) + addq.ph s0, t0, t7 // tmp0 + tmp7 + subq.ph s7, t0, t7 // tmp0 - tmp7 + mulq_s.ph v1, v1, t8 // ... z5 + lw a3, 0(a1) + lw t8, 12(AT) // FIX(-2.613125930) + shll_s.ph s5, s5, 1 // x2 + addu a3, a3, a2 + shll_s.ph v0, v0, 1 // x4 + mulq_s.ph v0, v0, t8 // tmp12 ... + mulq_s.ph s4, s6, s4 // tmp10 ... + shll_s.ph v1, v1, 1 // x2 + addiu a0, a0, 32 + addiu a1, a1, 8 + shll_s.ph s6, v0, 1 // x4 + shll_s.ph s4, s4, 1 // x2 + addq.ph s6, s6, v1 // ... tmp12 + shll_s.ph s0, s0, 2 + subq.ph t5, s6, t7 // tmp6 + subq.ph s4, s4, v1 // ... 
tmp10 + subq.ph t3, s5, t5 // tmp5 + shll_s.ph s7, s7, 2 + addq.ph t1, s4, t3 // tmp4 + addq.ph s1, t2, t5 // tmp1 + tmp6 + subq.ph s6, t2, t5 // tmp1 - tmp6 + addq.ph s2, t4, t3 // tmp2 + tmp5 + subq.ph s5, t4, t3 // tmp2 - tmp5 + addq.ph s4, t6, t1 // tmp3 + tmp4 + subq.ph s3, t6, t1 // tmp3 - tmp4 + shll_s.ph s1, s1, 2 + shll_s.ph s2, s2, 2 + shll_s.ph s3, s3, 2 + shll_s.ph s4, s4, 2 + shll_s.ph s5, s5, 2 + shll_s.ph s6, s6, 2 + precrq.ph.w t0, s1, s0 // B A + ins s0, s1, 16, 16 // b a + precrq.ph.w t2, s3, s2 // D C + ins s2, s3, 16, 16 // d c + precrq.ph.w t4, s5, s4 // F E + ins s4, s5, 16, 16 // f e + precrq.ph.w t6, s7, s6 // H G + ins s6, s7, 16, 16 // h g + precrq.qb.ph t0, t2, t0 // D C B A + precrq.qb.ph s0, s2, s0 // d c b a + precrq.qb.ph t4, t6, t4 // H G F E + precrq.qb.ph s4, s6, s4 // h g f e + addu.qb s0, s0, s8 + addu.qb s4, s4, s8 + sw s0, 0(a3) // outptr[0/1/2/3] d c b a + sw s4, 4(a3) // outptr[4/5/6/7] h g f e + lw a3, -4(a1) + addu.qb t0, t0, s8 + addu a3, a3, a2 + addu.qb t4, t4, s8 + sw t0, 0(a3) // outptr[0/1/2/3] D C B A + bne a0, t9, 0b + sw t4, 4(a3) // outptr[4/5/6/7] H G F E + +2: + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + j ra + nop + +END(jsimd_idct_ifast_rows_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2) +/* + * a0 - data + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + lui t0, 6437 + ori t0, 2260 + lui t1, 9633 + ori t1, 11363 + lui t2, 0xd39e + ori t2, 0xe6dc + lui t3, 0xf72d + ori t3, 9633 + lui t4, 2261 + ori t4, 9633 + lui t5, 0xd39e + ori t5, 6437 + lui t6, 9633 + ori t6, 0xd39d + lui t7, 0xe6dc + ori t7, 2260 + lui t8, 4433 + ori t8, 10703 + lui t9, 0xd630 + ori t9, 4433 + li s8, 8 + move a1, a0 +1: + lw s0, 0(a1) // tmp0 = 1|0 + lw s1, 4(a1) // tmp1 = 3|2 + lw s2, 8(a1) // tmp2 = 5|4 + lw s3, 12(a1) // tmp3 = 7|6 + packrl.ph s1, s1, s1 // tmp1 = 2|3 + packrl.ph s3, s3, s3 // 
tmp3 = 6|7 + subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 + subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 + dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 + mult $ac1, $0, $0 // ac1 = 0 + dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 + dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 + mult $ac2, $0, $0 // ac2 = 0 + dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 + dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 + mult $ac3, $0, $0 // ac3 = 0 + dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 + dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 + addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 + addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 + extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 + extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 + extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 + extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 + addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 + subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 + sh s0, 2(a1) + sh s1, 6(a1) + sh s2, 10(a1) + sh s3, 14(a1) + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 + mult $ac1, $0, $0 // ac1 = 0 + dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 + sra s4, s5, 16 // tmp4 = t11 + addiu a1, a1, 16 + addiu s8, s8, -1 + extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 + extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 + addu s2, s5, s4 // tmp2 = t10 + t11 + subu s3, s5, s4 // tmp3 = t10 - t11 + sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 + sll s3, s3, 2 // tmp3 = (t10 - t11) << 2 + sh s2, -16(a1) + sh s3, -8(a1) + sh s0, -12(a1) + bgtz s8, 1b + sh s1, -4(a1) + li t0, 2260 + li t1, 11363 + li t2, 9633 + li t3, 6436 + li t4, 6437 + li t5, 2261 + li t6, 11362 + li t7, 2259 + li t8, 4433 + li t9, 10703 + li a1, 10704 + li s8, 8 + +2: + lh a2, 0(a0) // 0 + lh a3, 16(a0) // 8 + lh v0, 32(a0) // 16 + lh v1, 48(a0) // 24 + lh s4, 64(a0) 
// 32 + lh s5, 80(a0) // 40 + lh s6, 96(a0) // 48 + lh s7, 112(a0) // 56 + addu s2, v0, s5 // tmp2 = 16 + 40 + subu s5, v0, s5 // tmp5 = 16 - 40 + addu s3, v1, s4 // tmp3 = 24 + 32 + subu s4, v1, s4 // tmp4 = 24 - 32 + addu s0, a2, s7 // tmp0 = 0 + 56 + subu s7, a2, s7 // tmp7 = 0 - 56 + addu s1, a3, s6 // tmp1 = 8 + 48 + subu s6, a3, s6 // tmp6 = 8 - 48 + addu a2, s0, s3 // tmp10 = tmp0 + tmp3 + subu v1, s0, s3 // tmp13 = tmp0 - tmp3 + addu a3, s1, s2 // tmp11 = tmp1 + tmp2 + subu v0, s1, s2 // tmp12 = tmp1 - tmp2 + mult s7, t1 // ac0 = tmp7 * c1 + madd s4, t0 // ac0 += tmp4 * c0 + madd s5, t4 // ac0 += tmp5 * c4 + madd s6, t2 // ac0 += tmp6 * c2 + mult $ac1, s7, t2 // ac1 = tmp7 * c2 + msub $ac1, s4, t3 // ac1 -= tmp4 * c3 + msub $ac1, s5, t6 // ac1 -= tmp5 * c6 + msub $ac1, s6, t7 // ac1 -= tmp6 * c7 + mult $ac2, s7, t4 // ac2 = tmp7 * c4 + madd $ac2, s4, t2 // ac2 += tmp4 * c2 + madd $ac2, s5, t5 // ac2 += tmp5 * c5 + msub $ac2, s6, t6 // ac2 -= tmp6 * c6 + mult $ac3, s7, t0 // ac3 = tmp7 * c0 + msub $ac3, s4, t1 // ac3 -= tmp4 * c1 + madd $ac3, s5, t2 // ac3 += tmp5 * c2 + msub $ac3, s6, t3 // ac3 -= tmp6 * c3 + extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 + extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 + extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 + extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 + addiu s8, s8, -1 + addu s4, a2, a3 // tmp4 = tmp10 + tmp11 + subu s5, a2, a3 // tmp5 = tmp10 - tmp11 + sh s0, 16(a0) + sh s1, 48(a0) + sh s2, 80(a0) + sh s3, 112(a0) + mult v0, t8 // ac0 = tmp12 * c8 + madd v1, t9 // ac0 += tmp13 * c9 + mult $ac1, v1, t8 // ac1 = tmp13 * c8 + msub $ac1, v0, a1 // ac1 -= tmp12 * c10 + addiu a0, a0, 2 + extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15 + extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15 + shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2 + shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2 + sh s4, -2(a0) + sh s5, 62(a0) + sh s6, 30(a0) + bgtz s8, 2b + sh s7, 94(a0) + + RESTORE_REGS_FROM_STACK 40, 
s0, s1, s2, s3, s4, s5, s6, s7, s8 + + jr ra + nop + +END(jsimd_fdct_islow_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2) +/* + * a0 - data + */ + .set at + SAVE_REGS_ON_STACK 8, s0, s1 + li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff) + li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff) + li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff) + li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff) + + move v0, a0 + addiu v1, v0, 128 // end address + +0: + lw t0, 0(v0) // tmp0 = 1|0 + lw t1, 4(v0) // tmp1 = 3|2 + lw t2, 8(v0) // tmp2 = 5|4 + lw t3, 12(v0) // tmp3 = 7|6 + packrl.ph t1, t1, t1 // tmp1 = 2|3 + packrl.ph t3, t3, t3 // tmp3 = 6|7 + subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4 + subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7 + addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3 + addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0 + addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10 + subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13 + sra t4, t8, 16 // tmp4 = t11 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, t9, s1 + mult $ac1, $0, $0 // ac1 = 0 + dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98 + dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98 + mult $ac2, $0, $0 // ac2 = 0 + dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139 + mult $ac3, $0, $0 // ac3 = 0 + dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334 + precrq.ph.w t0, t5, t7 // t0 = t5|t6 + addq.ph t2, t8, t4 // tmp2 = t10 + t11 + subq.ph t3, t8, t4 // tmp3 = t10 - t11 + extr.w t4, $ac0, 8 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181 + extr.w t0, $ac1, 8 // t0 = z5 + extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139) + extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334) + extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181) + add t6, t1, t0 // t6 = z2 + add t7, t7, t0 // t7 = z4 + subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3 + addq.ph t8, t5, t8 // t9 = z11 = tmp7 
+ z3 + addq.ph t1, t0, t6 // t1 = z13 + z2 + subq.ph t6, t0, t6 // t6 = z13 - z2 + addq.ph t0, t8, t7 // t0 = z11 + z4 + subq.ph t7, t8, t7 // t7 = z11 - z4 + addq.ph t5, t4, t9 + subq.ph t4, t9, t4 + sh t2, 0(v0) + sh t5, 4(v0) + sh t3, 8(v0) + sh t4, 12(v0) + sh t1, 10(v0) + sh t6, 6(v0) + sh t0, 2(v0) + sh t7, 14(v0) + addiu v0, 16 + bne v1, v0, 0b + nop + move v0, a0 + addiu v1, v0, 16 + +1: + lh t0, 0(v0) // 0 + lh t1, 16(v0) // 8 + lh t2, 32(v0) // 16 + lh t3, 48(v0) // 24 + lh t4, 64(v0) // 32 + lh t5, 80(v0) // 40 + lh t6, 96(v0) // 48 + lh t7, 112(v0) // 56 + add t8, t0, t7 // t8 = tmp0 + sub t7, t0, t7 // t7 = tmp7 + add t0, t1, t6 // t0 = tmp1 + sub t1, t1, t6 // t1 = tmp6 + add t6, t2, t5 // t6 = tmp2 + sub t5, t2, t5 // t5 = tmp5 + add t2, t3, t4 // t2 = tmp3 + sub t3, t3, t4 // t3 = tmp4 + add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3 + sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3 + sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2 + ins t8, s0, 16, 16 // t8 = tmp12|tmp13 + add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181 + add s0, t4, t2 // t8 = tmp10+tmp11 + sub t4, t4, t2 // t4 = tmp10-tmp11 + sh s0, 0(v0) + sh t4, 64(v0) + extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781) + addq.ph t4, t8, t2 // t9 = tmp13 + z1 + subq.ph t8, t8, t2 // t2 = tmp13 - z1 + sh t4, 32(v0) + sh t8, 96(v0) + add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5 + add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6 + add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7 + andi t4, a1, 0xffff + mul s0, t1, t4 + sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965) + ins t1, t3, 16, 16 // t1 = tmp10|tmp12 + mult $0, $0 // ac0 = 0 + mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98 + extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433) + add t2, t7, t8 // t2 = tmp7 + z5 + sub t7, t7, t8 // t7 = tmp7 - z5 + andi t4, a2, 0xffff + mul t8, t3, t4 + sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100) + andi t4, 
s1, 0xffff + mul t6, t0, t4 + sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781) + add t0, t6, t8 // t0 = z3 + z2 + sub t1, t6, t8 // t1 = z3 - z2 + add t3, t6, s0 // t3 = z3 + z4 + sub t4, t6, s0 // t4 = z3 - z4 + sub t5, t2, t1 // t5 = dataptr[5] + sub t6, t7, t0 // t6 = dataptr[3] + add t3, t2, t3 // t3 = dataptr[1] + add t4, t7, t4 // t4 = dataptr[7] + sh t5, 80(v0) + sh t6, 48(v0) + sh t3, 16(v0) + sh t4, 112(v0) + addiu v0, 2 + bne v0, v1, 1b + nop + + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop +END(jsimd_fdct_ifast_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2) +/* + * a0 - coef_block + * a1 - divisors + * a2 - workspace + */ + + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2 + + addiu v0, a2, 124 // v0 = workspace_end + lh t0, 0(a2) + lh t1, 0(a1) + lh t2, 128(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t4, 384(a1) + lh t5, 130(a1) + lh t6, 2(a2) + lh t7, 2(a1) + lh t8, 386(a1) + +1: + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + addiu a2, a2, 4 + addiu a1, a1, 4 + add s1, t6, t5 + andi s1, 0xffff + sh v1, 0(a0) + + mul s2, s1, t7 + addiu s1, t8, 16 + srav s2, s2, s1 + mul s2,s2, s0 + lh t0, 0(a2) + lh t1, 0(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t2, 128(a1) + lh t4, 384(a1) + lh t5, 130(a1) + lh t8, 386(a1) + lh t6, 2(a2) + lh t7, 2(a1) + sh s2, 2(a0) + lh t0, 0(a2) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0,t3 + bne a2, v0, 1b + addiu a0, a0, 4 + + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + sh v1, 0(a0) + add s1, t6, t5 + andi s1, 
0xffff + mul s2, s1, t7 + addiu s1, t8, 16 + addiu a2, a2, 4 + addiu a1, a1, 4 + srav s2, s2, s1 + mul s2, s2, s0 + sh s2, 2(a0) + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2 + + j ra + nop + +END(jsimd_quantize_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2) +/* + * a0 - coef_block + * a1 - divisors + * a2 - workspace + */ + + .set at + + li t1, 0x46800100 //integer representation 16384.5 + mtc1 t1, f0 + li t0, 63 +0: + lwc1 f2, 0(a2) + lwc1 f10, 0(a1) + lwc1 f4, 4(a2) + lwc1 f12, 4(a1) + lwc1 f6, 8(a2) + lwc1 f14, 8(a1) + lwc1 f8, 12(a2) + lwc1 f16, 12(a1) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + lwc1 f10, 16(a1) + lwc1 f12, 20(a1) + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + lwc1 f14, 24(a1) + lwc1 f16, 28(a1) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + lwc1 f2, 16(a2) + lwc1 f4, 20(a2) + lwc1 f6, 24(a2) + lwc1 f8, 28(a2) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + sh t1, 0(a0) + sh t2, 2(a0) + sh t3, 4(a0) + sh t4, 6(a0) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + addiu t0, t0, -8 + addiu a2, a2, 32 + addiu a1, a1, 32 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + sh t1, 8(a0) + sh t2, 10(a0) + sh t3, 12(a0) + sh t4, 14(a0) + bgez t0, 0b + addiu a0, a0, 16 + + j ra + nop + +END(jsimd_quantize_float_mips_dspr2) +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, 
s4, s5 + + addiu sp, sp, -40 + move v0, sp + addiu s2, zero, 29692 + addiu s3, zero, -10426 + addiu s4, zero, 6967 + addiu s5, zero, -5906 + lh t0, 0(a1) // t0 = inptr[DCTSIZE*0] + lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0] + lh t1, 48(a1) // t1 = inptr[DCTSIZE*3] + lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3] + mul t4, t5, t0 + lh t0, 16(a1) // t0 = inptr[DCTSIZE*1] + lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1] + mul t6, t6, t1 + mul t5, t5, t0 + lh t2, 80(a1) // t2 = inptr[DCTSIZE*5] + lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5] + lh t3, 112(a1) // t3 = inptr[DCTSIZE*7] + lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7] + mul t7, t7, t2 + mult zero, zero + mul t8, t8, t3 + li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff) + li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff) + ins t6, t5, 16, 16 // t6 = t5|t6 + sll t4, t4, 15 + dpa.w.ph $ac0, t6, s0 + lh t1, 2(a1) + lh t6, 2(a0) + ins t8, t7, 16, 16 // t8 = t7|t8 + dpa.w.ph $ac0, t8, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 18(a1) + lh t6, 18(a0) + lh t2, 50(a1) + lh t7, 50(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 82(a1) + lh t2, 82(a0) + lh t3, 114(a1) + lh t4, 114(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 0(v0) + sw t8, 20(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 6(a1) + lh t6, 6(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 22(a1) + lh t6, 22(a0) + lh t2, 54(a1) + lh t7, 54(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 86(a1) + lh t2, 86(a0) + lh t3, 118(a1) + lh t4, 118(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 4(v0) + sw t8, 24(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 10(a1) + lh t6, 10(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 26(a1) 
+ lh t6, 26(a0) + lh t2, 58(a1) + lh t7, 58(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 90(a1) + lh t2, 90(a0) + lh t3, 122(a1) + lh t4, 122(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 8(v0) + sw t8, 28(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 14(a1) + lh t6, 14(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 30(a1) + lh t6, 30(a0) + lh t2, 62(a1) + lh t7, 62(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 94(a1) + lh t2, 94(a0) + lh t3, 126(a1) + lh t4, 126(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 12(v0) + sw t8, 32(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + lw t9, 0(a2) + lw t3, 0(v0) + lw t7, 4(v0) + lw t1, 8(v0) + addu t9, t9, a3 + sll t3, t3, 15 + subu t8, t4, t0 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + shra_r.w t8, t8, 13 + sw t0, 16(v0) + sw t8, 36(v0) + lw t5, 12(v0) + lw t6, 16(v0) + mult t7, s2 + madd t1, s3 + madd t5, s4 + madd t6, s5 + lw t5, 24(v0) + lw t7, 28(v0) + mflo t0, $ac0 + lw t8, 32(v0) + lw t2, 36(v0) + mult $ac1, t5, s2 + madd $ac1, t7, s3 + madd $ac1, t8, s4 + madd $ac1, t2, s5 + addu t1, t3, t0 + subu t6, t3, t0 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + mflo t4, $ac1 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + lw t0, 20(v0) + sb t1, 0(t9) + sb t6, 1(t9) + sll t0, t0, 15 + lw t9, 4(a2) + addu t1, t0, t4 + subu t6, t0, t4 + addu t9, t9, a3 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + sb t1, 0(t9) + sb t6, 1(t9) + addiu sp, sp, 40 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, 
s5 + + j ra + nop + +END(jsimd_idct_2x2_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes + */ + + .set at + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw v1, 48(sp) + move t0, a1 + move t1, v1 + li t9, 4 + li s0, 0x2e75f93e + li s1, 0x21f9ba79 + li s2, 0xecc2efb0 + li s3, 0x52031ccd + +0: + lh s6, 32(t0) // inptr[DCTSIZE*2] + lh t6, 32(a0) // quantptr[DCTSIZE*2] + lh s7, 96(t0) // inptr[DCTSIZE*6] + lh t7, 96(a0) // quantptr[DCTSIZE*6] + mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh s4, 0(t0) // inptr[DCTSIZE*0] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s5, 0(a0) // quantptr[0] + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) + mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh t5, 112(t0) // inptr[DCTSIZE*7] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s4, 112(a0) // quantptr[DCTSIZE*7] + lh v0, 80(t0) // inptr[DCTSIZE*5] + lh s5, 80(a0) // quantptr[DCTSIZE*5] + lh s6, 48(a0) // quantptr[DCTSIZE*3] + sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) + lh s7, 16(a0) // quantptr[DCTSIZE*1] + lh t8, 16(t0) // inptr[DCTSIZE*1] + subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) + lh t7, 48(t0) // inptr[DCTSIZE*3] + mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) + mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) + mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) + mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) + addu t3, t2, t6 // tmp10 = tmp0 + z2 + subu t4, t2, t6 // tmp10 = tmp0 - z2 + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, v0, 16, 16 + ins t7, t8, 16, 16 + addiu t9, t9, -1 + dpa.w.ph $ac0, t5, s0 + 
dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo s4, $ac0 + mflo s5, $ac1 + addiu a0, a0, 2 + addiu t1, t1, 4 + addiu t0, t0, 2 + addu t6, t4, s4 + subu t5, t4, s4 + addu s6, t3, s5 + subu s7, t3, s5 + shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) + shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) + shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) + shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) + sw t6, 28(t1) + sw t5, 60(t1) + sw s6, -4(t1) + bgtz t9, 0b + sw s7, 92(t1) + // second loop three pass + li t9, 3 +1: + lh s6, 34(t0) // inptr[DCTSIZE*2] + lh t6, 34(a0) // quantptr[DCTSIZE*2] + lh s7, 98(t0) // inptr[DCTSIZE*6] + lh t7, 98(a0) // quantptr[DCTSIZE*6] + mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh s4, 2(t0) // inptr[DCTSIZE*0] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s5, 2(a0) // quantptr[DCTSIZE*0] + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) + mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh t5, 114(t0) // inptr[DCTSIZE*7] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s4, 114(a0) // quantptr[DCTSIZE*7] + lh s5, 82(a0) // quantptr[DCTSIZE*5] + lh t6, 82(t0) // inptr[DCTSIZE*5] + sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) + lh s6, 50(a0) // quantptr[DCTSIZE*3] + lh t8, 18(t0) // inptr[DCTSIZE*1] + subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) + lh t7, 50(t0) // inptr[DCTSIZE*3] + lh s7, 18(a0) // quantptr[DCTSIZE*1] + mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) + mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) + mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) + mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) + addu t3, t2, v0 // tmp10 = tmp0 + z2 + subu t4, t2, v0 // tmp10 = tmp0 - z2 + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + dpa.w.ph $ac0, t5, s0 + 
dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo t5, $ac0 + mflo t6, $ac1 + addiu t9, t9, -1 + addiu t0, t0, 2 + addiu a0, a0, 2 + addiu t1, t1, 4 + addu s5, t4, t5 + subu s4, t4, t5 + addu s6, t3, t6 + subu s7, t3, t6 + shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) + shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) + shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) + shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) + sw s5, 32(t1) + sw s4, 64(t1) + sw s6, 0(t1) + bgtz t9, 1b + sw s7, 96(t1) + move t1, v1 + li s4, 15137 + lw s6, 8(t1) // wsptr[2] + li s5, 6270 + lw s7, 24(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 0(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) + lh t5, 28(t1) // wsptr[7] + lh t6, 20(t1) // wsptr[5] + lh t7, 12(t1) // wsptr[3] + lh t8, 4(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2 + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2 + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + sll s4, t9, 2 + lw v0, 0(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb 
t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + // 2 + li s4, 15137 + lw s6, 40(t1) // wsptr[2] + li s5, 6270 + lw s7, 56(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 32(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) + lh t5, 60(t1) // wsptr[7] + lh t6, 52(t1) // wsptr[5] + lh t7, 44(t1) // wsptr[3] + lh t8, 36(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2 + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2 + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) + sll s4, t9, 2 + lw v0, 4(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + // 3 + li s4, 15137 + lw s6, 72(t1) // wsptr[2] + li s5, 6270 + lw s7, 88(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 64(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) + lh t5, 92(t1) // wsptr[7] + lh t6, 84(t1) // wsptr[5] + lh t7, 76(t1) // wsptr[3] + lh t8, 68(t1) // wsptr[1] + ins t5, 
t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2 + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2 + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + sll s4, t9, 2 + lw v0, 8(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + li s4, 15137 + lw s6, 104(t1) // wsptr[2] + li s5, 6270 + lw s7, 120(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 96(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865) + lh t5, 124(t1) // wsptr[7] + lh t6, 116(t1) // wsptr[5] + lh t7, 108(t1) // wsptr[3] + lh t8, 100(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2; + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2; + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, 
t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + sll s4, t9, 2 + lw v0, 12(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_idct_4x4_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -144 + move v0, sp + addiu v1, v0, 24 + addiu t9, zero, 5793 + addiu s0, zero, 10033 + addiu s1, zero, 2998 + +1: + lh s2, 0(a0) // q0 = quantptr[ 0] + lh s3, 32(a0) // q1 = quantptr[16] + lh s4, 64(a0) // q2 = quantptr[32] + lh t2, 64(a1) // tmp2 = inptr[32] + lh t1, 32(a1) // tmp1 = inptr[16] + lh t0, 0(a1) // tmp0 = inptr[ 0] + mul t2, t2, s4 // tmp2 = tmp2 * q2 + mul t1, t1, s3 // tmp1 = tmp1 * q1 + mul t0, t0, s2 // tmp0 = tmp0 * q0 + lh t6, 16(a1) // z1 = inptr[ 8] + lh t8, 80(a1) // z3 = inptr[40] + lh t7, 48(a1) // z2 = inptr[24] + lh s2, 16(a0) // q0 = quantptr[ 8] + lh s4, 80(a0) // q2 = quantptr[40] + lh s3, 48(a0) // q1 = quantptr[24] + mul t2, t2, t9 // tmp2 = tmp2 * 5793 + mul t1, t1, s0 // tmp1 = tmp1 * 10033 + sll t0, t0, 13 // tmp0 = tmp0 << 13 + mul t6, t6, s2 // z1 = z1 * q0 + mul t8, t8, s4 // z3 = z3 * q2 + mul t7, t7, s3 // z2 = z2 * q1 + addu t3, t0, t2 // tmp10 = tmp0 + tmp2 + sll t2, t2, 1 // tmp2 = 
tmp2 << 1 + subu t4, t0, t2 // tmp11 = tmp0 - tmp2; + subu t5, t3, t1 // tmp12 = tmp10 - tmp1 + addu t3, t3, t1 // tmp10 = tmp10 + tmp1 + addu t1, t6, t8 // tmp1 = z1 + z3 + mul t1, t1, s1 // tmp1 = tmp1 * 2998 + shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 + subu t2, t6, t8 // tmp2 = z1 - z3 + subu t2, t2, t7 // tmp2 = tmp2 - z2 + sll t2, t2, 2 // tmp2 = tmp2 << 2 + addu t0, t6, t7 // tmp0 = z1 + z2 + sll t0, t0, 13 // tmp0 = tmp0 << 13 + subu s2, t8, t7 // q0 = z3 - z2 + sll s2, s2, 13 // q0 = q0 << 13 + addu t0, t0, t1 // tmp0 = tmp0 + tmp1 + addu t1, s2, t1 // tmp1 = q0 + tmp1 + addu s2, t4, t2 // q0 = tmp11 + tmp2 + subu s3, t4, t2 // q1 = tmp11 - tmp2 + addu t6, t3, t0 // z1 = tmp10 + tmp0 + subu t7, t3, t0 // z2 = tmp10 - tmp0 + addu t4, t5, t1 // tmp11 = tmp12 + tmp1 + subu t5, t5, t1 // tmp12 = tmp12 - tmp1 + shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 + shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 + shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 + shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 + sw s2, 24(v0) + sw s3, 96(v0) + sw t6, 0(v0) + sw t7, 120(v0) + sw t4, 48(v0) + sw t5, 72(v0) + addiu v0, v0, 4 + addiu a1, a1, 2 + bne v0, v1, 1b + addiu a0, a0, 2 + + /* Pass 2: process 6 rows from work array, store into output array. 
*/ + move v0, sp + addiu v1, v0, 144 + +2: + lw t0, 0(v0) + lw t2, 16(v0) + lw s5, 0(a2) + addiu t0, t0, 16 + sll t0, t0, 13 + mul t3, t2, t9 + lw t6, 4(v0) + lw t8, 20(v0) + lw t7, 12(v0) + addu s5, s5, a3 + addu s6, t6, t8 + mul s6, s6, s1 + addu t1, t0, t3 + subu t4, t0, t3 + subu t4, t4, t3 + lw t3, 8(v0) + mul t0, t3, s0 + addu s7, t6, t7 + sll s7, s7, 13 + addu s7, s6, s7 + subu t2, t8, t7 + sll t2, t2, 13 + addu t2, s6, t2 + subu s6, t6, t7 + subu s6, s6, t8 + sll s6, s6, 13 + addu t3, t1, t0 + subu t5, t1, t0 + addu t6, t3, s7 + subu t3, t3, s7 + addu t7, t4, s6 + subu t4, t4, s6 + addu t8, t5, t2 + subu t5, t5, t2 + shll_s.w t6, t6, 6 + shll_s.w t3, t3, 6 + shll_s.w t7, t7, 6 + shll_s.w t4, t4, 6 + shll_s.w t8, t8, 6 + shll_s.w t5, t5, 6 + sra t6, t6, 24 + addiu t6, t6, 128 + sra t3, t3, 24 + addiu t3, t3, 128 + sb t6, 0(s5) + sra t7, t7, 24 + addiu t7, t7, 128 + sb t3, 5(s5) + sra t4, t4, 24 + addiu t4, t4, 128 + sb t7, 1(s5) + sra t8, t8, 24 + addiu t8, t8, 128 + sb t4, 4(s5) + addiu v0, v0, 24 + sra t5, t5, 24 + addiu t5, t5, 128 + sb t8, 2(s5) + addiu a2, a2, 4 + bne v0, v1, 2b + sb t5, 3(s5) + + addiu sp, sp, 144 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_6x6_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - workspace + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 8 + +1: + // odd part + lh t0, 48(a1) + lh t1, 48(a0) + lh t2, 16(a1) + lh t3, 16(a0) + lh t4, 80(a1) + lh t5, 80(a0) + lh t6, 112(a1) + lh t7, 112(a0) + mul t0, t0, t1 // z2 + mul t1, t2, t3 // z1 + mul t2, t4, t5 // z3 + mul t3, t6, t7 // z4 + li t4, 10703 // FIX(1.306562965) + li t5, 4433 // FIX_0_541196100 + li t6, 7053 // FIX(0.860918669) + mul t4, t0,t4 // tmp11 + mul t5, t0,t5 // -tmp14 + addu t7, t1,t2 // tmp10 + addu t8, t7,t3 // tmp10 + z4 + mul t6, t6, t8 // 
tmp15 + li t8, 2139 // FIX(0.261052384) + mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) + li t7, 2295 // FIX(0.280143716) + mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) + addu t9, t2, t3 // z3 + z4 + li s0, 8565 // FIX(1.045510580) + mul t9, t9, s0 // -tmp13 + li s0, 12112 // FIX(1.478575242) + mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) + li s1, 12998 // FIX(1.586706681) + mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) + li s2, 5540 // FIX(0.676326758) + mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) + li s3, 16244 // FIX(1.982889723) + mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) + subu t1, t1, t3 // z1-=z4 + subu t0, t0, t2 // z2-=z3 + addu t2, t0, t1 // z1+z2 + li t3, 4433 // FIX_0_541196100 + mul t2, t2, t3 // z3 + li t3, 6270 // FIX_0_765366865 + mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) + li t3, 15137 // FIX_1_847759065 + mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) + addu t8, t6, t8 // tmp12 + addu t3, t8, t4 // tmp12 + tmp11 + addu t3, t3, t7 // tmp10 + subu t8, t8, t9 // tmp12 + tmp13 + addu s0, t5, s0 + subu t8, t8, s0 // tmp12 + subu t9, t6, t9 + subu s1, s1, t4 + addu t9, t9, s1 // tmp13 + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 // tmp15 + // even part start + lh t4, 64(a1) + lh t5, 64(a0) + lh t7, 32(a1) + lh s0, 32(a0) + lh s1, 0(a1) + lh s2, 0(a0) + lh s3, 96(a1) + lh v0, 96(a0) + mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4]) + mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2]) + mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0]) + mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6]) + // odd part end + addu t1, t2, t1 // tmp11 + subu t0, t2, t0 // tmp14 + // update counter and pointers + addiu a3, a3, -1 + addiu a0, a0, 2 + addiu a1, a1, 2 + // even part rest + li s1, 10033 // FIX(1.224744871) + li s2, 11190 // FIX(1.366025404) + mul t4, t4, s1 // z4 + mul s1, t5, s2 // z4 + sll t5, t5, 13 // z1 + sll t7, t7, 13 + addiu t7, t7, 1024 // z3 + sll s0, s0, 13 // z2 + addu s2, t7, t4 
// tmp10 + subu t4, t7, t4 // tmp11 + subu s3, t5, s0 // tmp12 + addu t2, t7, s3 // tmp21 + subu s3, t7, s3 // tmp24 + addu t7, s1, s0 // tmp12 + addu v0, s2, t7 // tmp20 + subu s2, s2, t7 // tmp25 + subu s1, s1, t5 // z4 - z1 + subu s1, s1, s0 // tmp12 + addu s0, t4, s1 // tmp22 + subu t4, t4, s1 // tmp23 + // final output stage + addu t5, v0, t3 + subu v0, v0, t3 + addu t3, t2, t1 + subu t2, t2, t1 + addu t1, s0, t8 + subu s0, s0, t8 + addu t8, t4, t9 + subu t4, t4, t9 + addu t9, s3, t0 + subu s3, s3, t0 + addu t0, s2, t6 + subu s2, s2, t6 + sra t5, t5, 11 + sra t3, t3, 11 + sra t1, t1, 11 + sra t8, t8, 11 + sra t9, t9, 11 + sra t0, t0, 11 + sra s2, s2, 11 + sra s3, s3, 11 + sra t4, t4, 11 + sra s0, s0, 11 + sra t2, t2, 11 + sra v0, v0, 11 + sw t5, 0(a2) + sw t3, 32(a2) + sw t1, 64(a2) + sw t8, 96(a2) + sw t9, 128(a2) + sw t0, 160(a2) + sw s2, 192(a2) + sw s3, 224(a2) + sw t4, 256(a2) + sw s0, 288(a2) + sw t2, 320(a2) + sw v0, 352(a2) + bgtz a3, 1b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop + +END(jsimd_idct_12x12_pass1_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) +/* + * a0 - workspace + * a1 - output + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 12 + +1: + // Odd part + lw t0, 12(a0) + lw t1, 4(a0) + lw t2, 20(a0) + lw t3, 28(a0) + li t4, 10703 // FIX(1.306562965) + li t5, 4433 // FIX_0_541196100 + mul t4, t0, t4 // tmp11 + mul t5, t0, t5 // -tmp14 + addu t6, t1, t2 // tmp10 + li t7, 2139 // FIX(0.261052384) + mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) + addu t6, t6, t3 // tmp10 + z4 + li t8, 7053 // FIX(0.860918669) + mul t6, t6, t8 // tmp15 + li t8, 2295 // FIX(0.280143716) + mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) + addu t9, t2, t3 // z3 + z4 + li s0, 8565 // FIX(1.045510580) + mul t9, t9, s0 // -tmp13 + li s0, 12112 // FIX(1.478575242) + mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) + li 
s1, 12998 // FIX(1.586706681) + mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) + li s2, 5540 // FIX(0.676326758) + mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) + li s3, 16244 // FIX(1.982889723) + mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) + subu t1, t1, t3 // z1 -= z4 + subu t0, t0, t2 // z2 -= z3 + addu t2, t1, t0 // z1 + z2 + li t3, 4433 // FIX_0_541196100 + mul t2, t2, t3 // z3 + li t3, 6270 // FIX_0_765366865 + mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) + li t3, 15137 // FIX_1_847759065 + mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) + addu t3, t6, t7 // tmp12 + addu t7, t3, t4 + addu t7, t7, t8 // tmp10 + subu t3, t3, t9 + subu t3, t3, t5 + subu t3, t3, s0 // tmp12 + subu t9, t6, t9 + subu t9, t9, t4 + addu t9, t9, s1 // tmp13 + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 // tmp15 + addu t1, t2, t1 // tmp11 + subu t0, t2, t0 // tmp14 + // even part + lw t2, 16(a0) // z4 + lw t4, 8(a0) // z1 + lw t5, 0(a0) // z3 + lw t8, 24(a0) // z2 + li s0, 10033 // FIX(1.224744871) + li s1, 11190 // FIX(1.366025404) + mul t2, t2, s0 // z4 + mul s0, t4, s1 // z4 + addiu t5, t5, 0x10 + sll t5, t5, 13 // z3 + sll t4, t4, 13 // z1 + sll t8, t8, 13 // z2 + subu s1, t4, t8 // tmp12 + addu s2, t5, t2 // tmp10 + subu t2, t5, t2 // tmp11 + addu s3, t5, s1 // tmp21 + subu s1, t5, s1 // tmp24 + addu t5, s0, t8 // tmp12 + addu v0, s2, t5 // tmp20 + subu t5, s2, t5 // tmp25 + subu t4, s0, t4 + subu t4, t4, t8 // tmp12 + addu t8, t2, t4 // tmp22 + subu t2, t2, t4 // tmp23 + // increment counter and pointers + addiu a3, a3, -1 + addiu a0, a0, 32 + // Final stage + addu t4, v0, t7 + subu v0, v0, t7 + addu t7, s3, t1 + subu s3, s3, t1 + addu t1, t8, t3 + subu t8, t8, t3 + addu t3, t2, t9 + subu t2, t2, t9 + addu t9, s1, t0 + subu s1, s1, t0 + addu t0, t5, t6 + subu t5, t5, t6 + sll t4, t4, 4 + sll t7, t7, 4 + sll t1, t1, 4 + sll t3, t3, 4 + sll t9, t9, 4 + sll t0, t0, 4 + sll t5, t5, 4 + sll s1, s1, 4 + sll t2, t2, 4 + sll t8, t8, 4 + sll s3, s3, 4 + sll v0, v0, 4 
+ shll_s.w t4, t4, 2 + shll_s.w t7, t7, 2 + shll_s.w t1, t1, 2 + shll_s.w t3, t3, 2 + shll_s.w t9, t9, 2 + shll_s.w t0, t0, 2 + shll_s.w t5, t5, 2 + shll_s.w s1, s1, 2 + shll_s.w t2, t2, 2 + shll_s.w t8, t8, 2 + shll_s.w s3, s3, 2 + shll_s.w v0, v0, 2 + srl t4, t4, 24 + srl t7, t7, 24 + srl t1, t1, 24 + srl t3, t3, 24 + srl t9, t9, 24 + srl t0, t0, 24 + srl t5, t5, 24 + srl s1, s1, 24 + srl t2, t2, 24 + srl t8, t8, 24 + srl s3, s3, 24 + srl v0, v0, 24 + lw t6, 0(a1) + addiu t4, t4, 0x80 + addiu t7, t7, 0x80 + addiu t1, t1, 0x80 + addiu t3, t3, 0x80 + addiu t9, t9, 0x80 + addiu t0, t0, 0x80 + addiu t5, t5, 0x80 + addiu s1, s1, 0x80 + addiu t2, t2, 0x80 + addiu t8, t8, 0x80 + addiu s3, s3, 0x80 + addiu v0, v0, 0x80 + sb t4, 0(t6) + sb t7, 1(t6) + sb t1, 2(t6) + sb t3, 3(t6) + sb t9, 4(t6) + sb t0, 5(t6) + sb t5, 6(t6) + sb s1, 7(t6) + sb t2, 8(t6) + sb t8, 9(t6) + sb s3, 10(t6) + sb v0, 11(t6) + bgtz a3, 1b + addiu a1, a1, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + jr ra + nop + +END(jsimd_idct_12x12_pass2_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2) +/* + * a0 - sample_data + * a1 - start_col + * a2 - workspace + */ + + lw t0, 0(a0) + li t7, 0xff80ff80 + addu t0, t0, a1 + ulw t1, 0(t0) + ulw t2, 4(t0) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + lw t0, 4(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 0(a2) + usw t4, 4(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 8(a2) + usw t6, 12(a2) + + lw t0, 8(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 16(a2) + usw t4, 20(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 24(a2) + usw t6, 28(a2) + + lw t0, 
12(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 32(a2) + usw t4, 36(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 40(a2) + usw t6, 44(a2) + + lw t0, 16(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 48(a2) + usw t4, 52(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 56(a2) + usw t6, 60(a2) + + lw t0, 20(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 64(a2) + usw t4, 68(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 72(a2) + usw t6, 76(a2) + + lw t0, 24(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 80(a2) + usw t4, 84(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 88(a2) + usw t6, 92(a2) + + lw t0, 28(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 96(a2) + usw t4, 100(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 104(a2) + usw t6, 108(a2) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 112(a2) + usw t4, 116(a2) + usw t5, 120(a2) + usw t6, 124(a2) + + j ra + nop + +END(jsimd_convsamp_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2) +/* + * a0 - sample_data + * a1 - start_col + * a2 - workspace 
+ */ + + .set at + + lw t0, 0(a0) + addu t0, t0, a1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 4(a0) + swc1 f2, 0(a2) + swc1 f4, 4(a2) + swc1 f6, 8(a2) + addu t0, t0, a1 + swc1 f8, 12(a2) + swc1 f10, 16(a2) + swc1 f12, 20(a2) + swc1 f14, 24(a2) + swc1 f16, 28(a2) + //elemr 1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 8(a0) + swc1 f2, 32(a2) + swc1 f4, 36(a2) + swc1 f6, 40(a2) + addu t0, t0, a1 + swc1 f8, 44(a2) + swc1 f10, 48(a2) + swc1 f12, 52(a2) + swc1 f14, 56(a2) + swc1 f16, 60(a2) + //elemr 2 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + 
cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 12(a0) + swc1 f2, 64(a2) + swc1 f4, 68(a2) + swc1 f6, 72(a2) + addu t0, t0, a1 + swc1 f8, 76(a2) + swc1 f10, 80(a2) + swc1 f12, 84(a2) + swc1 f14, 88(a2) + swc1 f16, 92(a2) + //elemr 3 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 16(a0) + swc1 f2, 96(a2) + swc1 f4, 100(a2) + swc1 f6, 104(a2) + addu t0, t0, a1 + swc1 f8, 108(a2) + swc1 f10, 112(a2) + swc1 f12, 116(a2) + swc1 f14, 120(a2) + swc1 f16, 124(a2) + //elemr 4 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 20(a0) + swc1 f2, 128(a2) + swc1 f4, 132(a2) + swc1 f6, 136(a2) + addu t0, t0, a1 + swc1 f8, 140(a2) + swc1 f10, 144(a2) + swc1 f12, 148(a2) + swc1 f14, 152(a2) + swc1 f16, 156(a2) + //elemr 5 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, 
t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 24(a0) + swc1 f2, 160(a2) + swc1 f4, 164(a2) + swc1 f6, 168(a2) + addu t0, t0, a1 + swc1 f8, 172(a2) + swc1 f10, 176(a2) + swc1 f12, 180(a2) + swc1 f14, 184(a2) + swc1 f16, 188(a2) + //elemr 6 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 28(a0) + swc1 f2, 192(a2) + swc1 f4, 196(a2) + swc1 f6, 200(a2) + addu t0, t0, a1 + swc1 f8, 204(a2) + swc1 f10, 208(a2) + swc1 f12, 212(a2) + swc1 f14, 216(a2) + swc1 f16, 220(a2) + //elemr 7 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + swc1 f2, 224(a2) + swc1 f4, 228(a2) + swc1 f6, 232(a2) + swc1 f8, 236(a2) + swc1 f10, 240(a2) + swc1 f12, 
244(a2) + swc1 f14, 248(a2) + swc1 f16, 252(a2) + + j ra + nop + +END(jsimd_convsamp_float_mips_dspr2) + +/*****************************************************************************/ diff --git a/Builder/jni-1.11/simd/jsimd_mips_dspr2_asm.h b/Builder/jni-1.11/simd/jsimd_mips_dspr2_asm.h new file mode 100644 index 000000000..499e34b7b --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_mips_dspr2_asm.h @@ -0,0 +1,283 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * All Rights Reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: .frame sp, 0, ra; \ + .set push; \ + .set arch=mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_MIPS_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function,.-function + +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of register to/from stack. Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). 
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * reserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. Maximum number of registers that + * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are subtracted from stack pointer (sp) + * before registers are pushed in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be pozitive and multiple of 4." 
+ .endif + .if \stack_offset != 0 + addiu sp, sp, -\stack_offset + .endif + sw \r1, 0(sp) + .if \r2 != 0 + sw \r2, 4(sp) + .endif + .if \r3 != 0 + sw \r3, 8(sp) + .endif + .if \r4 != 0 + sw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) + .endif +.endm + +/* + * Restores set of registers from stack. Maximum number of registers that + * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with SAVE_REGS_ON_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4) + .error "Stack offset must be pozitive and multiple of 4." 
+ .endif + lw \r1, 0(sp) + .if \r2 != 0 + lw \r2, 4(sp) + .endif + .if \r3 != 0 + lw \r3, 8(sp) + .endif + .if \r4 != 0 + lw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) + .endif + .if \stack_offset != 0 + addiu sp, sp, \stack_offset + .endif +.endm diff --git a/Builder/jni-1.11/simd/jsimd_powerpc.c b/Builder/jni-1.11/simd/jsimd_powerpc.c new file mode 100644 index 000000000..47dd746f0 --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_powerpc.c @@ -0,0 +1,852 @@ +/* + * jsimd_powerpc.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2009-2011, 2014-2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * PowerPC architecture. 
+ */ + +#ifdef __amigaos4__ +/* This must be defined first as it re-defines GLOBAL otherwise */ +#include <proto/exec.h> +#endif + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#if defined(__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> +#endif + +/* Bitmask of detected JSIMD_* capabilities; ~0 means init_simd() has not run yet. */ +static unsigned int simd_support = ~0; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +/* + * Test one line of text for a CPU feature keyword. + * + * Returns 1 when 'buffer' begins with "cpu" and, after that prefix and any + * following whitespace, contains 'feature' as a whitespace-delimited word. + * Returns 0 otherwise, including when 'feature' is the empty string. + */ +LOCAL(int) +check_feature (char *buffer, char *feature) +{ + char *p; + if (*feature == 0) + return 0; + if (strncmp(buffer, "cpu", 3) != 0) + return 0; + buffer += 3; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + /* Match is glued to a preceding character: advance and search again */ + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + /* Match is only a prefix of a longer word: advance and search again */ + buffer++; + continue; + } + return 1; + } + return 0; +} + +/* + * Scan /proc/cpuinfo line by line using a heap buffer of 'bufsize' bytes and + * set JSIMD_ALTIVEC in simd_support when a line advertises "altivec". + * + * simd_support is reset to 0 on entry. Returns 0 when the buffer could not + * be allocated or a line did not fit in it (the caller may retry with a + * larger size); returns 1 otherwise, including when the file cannot be + * opened. + */ +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "altivec")) + simd_support |= JSIMD_ALTIVEC; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. 
+ */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; +#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#elif defined(__amigaos4__) + uint32 altivec = 0; +#elif defined(__OpenBSD__) + int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; + int altivec; + size_t len = sizeof(altivec); +#endif + + /* simd_support is initialised to ~0; any other value means detection already ran */ + if (simd_support != ~0U) + return; + + simd_support = 0; + + /* Platform-specific AltiVec detection; exactly one branch is compiled in */ +#if defined(__ALTIVEC__) || defined(__APPLE__) + simd_support |= JSIMD_ALTIVEC; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + /* Retry with a doubled line buffer until parsing succeeds or the size cap is exceeded */ + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#elif defined(__amigaos4__) + IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE); + if(altivec == VECTORTYPE_ALTIVEC) + simd_support |= JSIMD_ALTIVEC; +#elif defined(__OpenBSD__) + if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0) + simd_support |= JSIMD_ALTIVEC; +#endif + + /* Force different settings through environment variables */ + /* JSIMD_FORCENONE is tested last, so it wins when both variables are "1" */ + env = getenv("JSIMD_FORCEALTIVEC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ALTIVEC; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +} + +/* + * Capability query paired with jsimd_rgb_ycc_convert(). Returns 1 only when + * samples are 8 bits, JDIMENSION is 4 bytes, RGB_PIXELSIZE is 3 or 4 and + * AltiVec was detected; returns 0 otherwise. + */ +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +/* + * Capability query paired with jsimd_rgb_gray_convert(). Applies the same + * compile-time layout checks as jsimd_can_rgb_ycc() and returns 1 only when + * AltiVec was detected; returns 0 otherwise. + */ +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + 
+GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_extrgb_ycc_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_extrgbx_ycc_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_extbgr_ycc_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_extbgrx_ycc_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_extxbgr_ycc_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_extxrgb_ycc_convert_altivec; + break; + default: + altivecfct=jsimd_rgb_ycc_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_extrgb_gray_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_extrgbx_gray_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_extbgr_gray_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_extbgrx_gray_convert_altivec; + break; + case JCS_EXT_XBGR: + case 
JCS_EXT_ABGR: + altivecfct=jsimd_extxbgr_gray_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_extxrgb_gray_convert_altivec; + break; + default: + altivecfct=jsimd_rgb_gray_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_ycc_extrgb_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_ycc_extrgbx_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_ycc_extbgr_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_ycc_extbgrx_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_ycc_extxbgr_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_ycc_extxrgb_convert_altivec; + break; + default: + altivecfct=jsimd_ycc_rgb_convert_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + 
+GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) 
+jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + 
altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct=jsimd_h2v2_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct=jsimd_h2v1_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_altivec(sample_data, 
start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + jsimd_fdct_islow_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_altivec(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, 
+ JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} diff --git a/Builder/jni-1.11/simd/jsimd_x86_64.c b/Builder/jni-1.11/simd/jsimd_x86_64.c new file mode 100644 index 000000000..de0c60a29 --- /dev/null +++ b/Builder/jni-1.11/simd/jsimd_x86_64.c @@ -0,0 +1,887 @@ +/* + * jsimd_x86_64.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. 
+ * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit x86 architecture. + */ + +#define JPEG_INTERNALS +#include "h/jinclude.h" +#include "h/jpeglib.h" +#include "h/jsimd.h" +#include "h/jdct.h" +#include "h/jsimddct.h" +#include "jsimd.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. 
+ */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; + + if (simd_support != ~0U) + return; + + simd_support = JSIMD_SSE2 | JSIMD_SSE; + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_ycc_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + 
sse2fct=jsimd_extrgbx_ycc_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_ycc_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_extbgrx_ycc_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_extxbgr_ycc_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_extxrgb_ycc_convert_sse2; + break; + default: + sse2fct=jsimd_rgb_ycc_convert_sse2; + break; + } + + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + break; + } + + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_ycc_extrgb_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_ycc_extrgbx_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_ycc_extbgr_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: 
+ sse2fct=jsimd_ycc_extbgrx_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_ycc_extxbgr_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_ycc_extxrgb_convert_sse2; + break; + default: + sse2fct=jsimd_ycc_rgb_convert_sse2; + break; + } + + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + 
init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample 
(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2; + break; + default: + sse2fct=jsimd_h2v2_merged_upsample_sse2; + break; + } + + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2; + 
break; + case JCS_EXT_BGR: + sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2; + break; + default: + sse2fct=jsimd_h2v1_merged_upsample_sse2; + break; + } + + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_sse2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 
0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + jsimd_fdct_islow_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ + jsimd_fdct_float_sse(data); +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_sse2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + jsimd_quantize_float_sse2(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && 
IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; 
+ if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} diff --git a/Builder/jni-1.11/simd/jsimdcfg.inc.h b/Builder/jni-1.11/simd/jsimdcfg.inc.h new file mode 100644 index 000000000..d2b499fae --- /dev/null +++ b/Builder/jni-1.11/simd/jsimdcfg.inc.h @@ -0,0 +1,130 @@ +// This file generates the include file for the assembly +// implementations by abusing the C preprocessor. +// +// Note: Some things are manually defined as they need to +// be mapped to NASM types. 
+ +; +; Automatically generated include file from jsimdcfg.inc.h +; + +#define JPEG_INTERNALS + +#include "../jpeglib.h" +#include "../jconfig.h" +#include "../jmorecfg.h" +#include "jsimd.h" + +; +; -- jpeglib.h +; + +%define _cpp_protection_DCTSIZE DCTSIZE +%define _cpp_protection_DCTSIZE2 DCTSIZE2 + +; +; -- jmorecfg.h +; + +%define _cpp_protection_RGB_RED RGB_RED +%define _cpp_protection_RGB_GREEN RGB_GREEN +%define _cpp_protection_RGB_BLUE RGB_BLUE +%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE + +%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED +%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN +%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE +%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE + +%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED +%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN +%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE +%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE + +%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED +%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN +%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE +%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE + +%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED +%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN +%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE +%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE + +%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED +%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN +%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE +%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE + +%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED +%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN +%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE +%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE + +%define RGBX_FILLER_0XFF 1 + +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. 
+; + +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) + +%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE + +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) + +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) + +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) + +; +; -- jdct.h +; + +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) + +%define FAST_FLOAT FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) + +; To maximize parallelism, Type MULTIPLIER is changed to short. 
+; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) + +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors + +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) + +; +; -- jsimd.h +; + +%define _cpp_protection_JSIMD_NONE JSIMD_NONE +%define _cpp_protection_JSIMD_MMX JSIMD_MMX +%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW +%define _cpp_protection_JSIMD_SSE JSIMD_SSE +%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 diff --git a/Builder/jni-1.11/simd/jsimdcpu.asm b/Builder/jni-1.11/simd/jsimdcpu.asm new file mode 100644 index 000000000..599083b18 --- /dev/null +++ b/Builder/jni-1.11/simd/jsimdcpu.asm @@ -0,0 +1,104 @@ +; +; jsimdcpu.asm - SIMD instruction support check +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Check if the CPU supports SIMD instructions +; +; GLOBAL(unsigned int) +; jpeg_simd_cpu_support (void) +; + + align 16 + global EXTN(jpeg_simd_cpu_support) + +EXTN(jpeg_simd_cpu_support): + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused + push edi + + xor edi,edi ; simd support flag + + pushfd + pop eax + mov edx,eax + xor eax, 1<<21 ; flip ID bit in EFLAGS + push eax + popfd + pushfd + pop eax + xor eax,edx + jz short .return ; CPUID is not supported + + ; Check for MMX instruction support + xor eax,eax + cpuid + test eax,eax + jz short .return + + xor eax,eax + inc eax + cpuid + mov eax,edx ; eax = Standard feature flags + + test eax, 1<<23 ; bit23:MMX + jz short .no_mmx + or edi, byte JSIMD_MMX +.no_mmx: + test eax, 1<<25 ; bit25:SSE + jz short .no_sse + or edi, byte JSIMD_SSE +.no_sse: + test eax, 1<<26 ; bit26:SSE2 + jz short .no_sse2 + or edi, byte JSIMD_SSE2 +.no_sse2: + + ; Check for 3DNow! instruction support + mov eax, 0x80000000 + cpuid + cmp eax, 0x80000000 + jbe short .return + + mov eax, 0x80000001 + cpuid + mov eax,edx ; eax = Extended feature flags + + test eax, 1<<31 ; bit31:3DNow!(vendor independent) + jz short .no_3dnow + or edi, byte JSIMD_3DNOW +.no_3dnow: + +.return: + mov eax,edi + + pop edi +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 16 diff --git a/Builder/jni-1.11/simd/i386/src/jsimdext.inc b/Builder/jni-1.11/simd/jsimdext.inc similarity index 59% rename from Builder/jni-1.11/simd/i386/src/jsimdext.inc rename to Builder/jni-1.11/simd/jsimdext.inc index 253b8972f..f28db60b5 100644 --- a/Builder/jni-1.11/simd/i386/src/jsimdext.inc +++ b/Builder/jni-1.11/simd/jsimdext.inc @@ -2,10 +2,9 @@ ; jsimdext.inc - common declarations ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright 2010 D. R. Commander +; Copyright (C) 2010, D. R. Commander. ; -; Based on -; x86 SIMD extension for IJG JPEG library - version 1.02 +; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 ; ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; @@ -30,7 +29,7 @@ ; ========================================================================== ; System-dependent configurations -%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- +%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- ; * Microsoft Visual C++ ; * MinGW (Minimalist GNU for Windows) ; * CygWin @@ -46,7 +45,7 @@ %define SEG_CONST .rdata align=16 public use32 class=CONST %endif -%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- +%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- ; * Microsoft Visual C++ ; -- segment definition -- @@ -58,17 +57,17 @@ %define SEG_TEXT .text align=16 public use64 class=CODE %define SEG_CONST .rdata align=16 public use64 class=CONST %endif -%define EXTN(name) name ; foo() -> foo +%define EXTN(name) name ; foo() -> foo -%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- +%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- ; * Borland C++ (Win32) ; -- segment definition -- ; -%define SEG_TEXT .text align=16 public use32 class=CODE -%define SEG_CONST .data align=16 public use32 class=DATA +%define SEG_TEXT _text align=16 public use32 class=CODE +%define SEG_CONST _data align=16 public use32 class=DATA -%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ +%elifdef ELF ; ----(nasm -felf[64] 
-DELF ...)------------ ; * Linux ; * *BSD family Unix using elf format ; * Unix System V, including Solaris x86, UnixWare and SCO Unix @@ -88,10 +87,10 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; To make the code position-independent, append -DPIC to the commandline ; -%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC -%define EXTN(name) name ; foo() -> foo +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC +%define EXTN(name) name ; foo() -> foo -%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- +%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- ; * Older Linux using a.out format (nasm -f aout -DAOUT ...) ; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) @@ -102,29 +101,29 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; To make the code position-independent, append -DPIC to the commandline ; -%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC -%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) ; -- segment definition -- ; -%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? +%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? %define SEG_CONST .rodata align=16 ; The generation of position-independent code (PIC) is the default on Darwin. 
; %define PIC -%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing +%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing -%else ; ----(Other case)---------------------- +%else ; ----(Other case)---------------------- ; -- segment definition -- ; %define SEG_TEXT .text %define SEG_CONST .data -%endif ; ---------------------------------------------- +%endif ; ---------------------------------------------- ; ========================================================================== @@ -179,7 +178,7 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; External Symbol Name ; %ifndef EXTN -%define EXTN(name) _ %+ name ; foo() -> _foo +%define EXTN(name) _ %+ name ; foo() -> _foo %endif ; -------------------------------------------------------------------------- @@ -196,79 +195,79 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; At present, nasm doesn't seem to support PIC generation for Mach-O. ; The PIC support code below is a little tricky. - SECTION SEG_CONST + SECTION SEG_CONST const_base: %define GOTOFF(got,sym) (got) + (sym) - const_base -%imacro get_GOT 1 - ; NOTE: this macro destroys ecx resister. - call %%geteip - add ecx, byte (%%ref - $) - jmp short %%adjust +%imacro get_GOT 1 + ; NOTE: this macro destroys ecx resister. 
+ call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust %%geteip: - mov ecx, POINTER [esp] - ret + mov ecx, POINTER [esp] + ret %%adjust: - push ebp - xor ebp,ebp ; ebp = 0 -%ifidni %1,ebx ; (%1 == ebx) - ; db 0x8D,0x9C + jmp near const_base = - ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) - db 0x8D,0x9C ; 8D,9C - jmp near const_base ; E9,(const_base-%%ref) + push ebp + xor ebp,ebp ; ebp = 0 +%ifidni %1,ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D,0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) %%ref: %else ; (%1 != ebx) - ; db 0x8D,0x8C + jmp near const_base = - ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) - db 0x8D,0x8C ; 8D,8C - jmp near const_base ; E9,(const_base-%%ref) -%%ref: mov %1, ecx + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D,0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: mov %1, ecx %endif ; (%1 == ebx) - pop ebp + pop ebp %endmacro -%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff -%imacro get_GOT 1 - extern GOT_SYMBOL - call %%geteip - add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc - jmp short %%done +%imacro get_GOT 1 + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done %%geteip: - mov %1, POINTER [esp] - ret + mov %1, POINTER [esp] + ret %%done: %endmacro -%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- -%imacro pushpic 1.nolist - push %1 +%imacro pushpic 1.nolist + push %1 %endmacro -%imacro poppic 1.nolist - pop %1 +%imacro poppic 1.nolist + pop %1 %endmacro -%imacro movpic 2.nolist - mov %1,%2 +%imacro movpic 2.nolist + mov %1,%2 %endmacro -%else ; !PIC ----------------------------------------- +%else ; !PIC 
----------------------------------------- %define GOTOFF(got,sym) (sym) -%imacro get_GOT 1.nolist +%imacro get_GOT 1.nolist %endmacro -%imacro pushpic 1.nolist +%imacro pushpic 1.nolist %endmacro -%imacro poppic 1.nolist +%imacro poppic 1.nolist %endmacro -%imacro movpic 2.nolist +%imacro movpic 2.nolist %endmacro -%endif ; PIC ----------------------------------------- +%endif ; PIC ----------------------------------------- ; -------------------------------------------------------------------------- ; Align the next instruction on {2,4,8,16,..}-byte boundary. @@ -278,28 +277,28 @@ const_base: %define FILLB(b,n) (($$-(b)) & ((n)-1)) %imacro alignx 1-2.nolist 0xFFFF -%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ - db 0x90 ; nop - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ - db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ - db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ - db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ - db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ - db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ - db 0x8B,0xED ; mov ebp,ebp - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ - db 0x90 ; nop +%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ + db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ + db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ + db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ + db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] + times 
MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ + db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ + db 0x8B,0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ + db 0x90 ; nop %endmacro ; Align the next data on {2,4,8,16,..}-byte boundary. ; %imacro alignz 1.nolist - align %1, db 0 ; filling zeros + align %1, db 0 ; filling zeros %endmacro %ifdef __x86_64__ @@ -307,61 +306,61 @@ const_base: %ifdef WIN64 %imacro collect_args 0 - push r12 - push r13 - push r14 - push r15 - mov r10, rcx - mov r11, rdx - mov r12, r8 - mov r13, r9 - mov r14, [rax+48] - mov r15, [rax+56] - push rsi - push rdi - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm6 - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm7 + push r12 + push r13 + push r14 + push r15 + mov r10, rcx + mov r11, rdx + mov r12, r8 + mov r13, r9 + mov r14, [rax+48] + mov r15, [rax+56] + push rsi + push rdi + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 %endmacro %imacro uncollect_args 0 - movaps xmm7, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - movaps xmm6, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + pop rdi + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 %endmacro %else %imacro collect_args 0 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - mov r10, rdi - mov r11, rsi - mov r12, rdx - mov r13, rcx - mov r14, r8 - mov r15, r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + mov r10, rdi + mov r11, rsi + mov r12, rdx + mov r13, rcx + mov r14, r8 + mov r15, r9 %endmacro %imacro uncollect_args 0 - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 %endmacro %endif diff --git a/Builder/jni-1.11/simd/nasm_lt.sh 
b/Builder/jni-1.11/simd/nasm_lt.sh new file mode 100755 index 000000000..817be1612 --- /dev/null +++ b/Builder/jni-1.11/simd/nasm_lt.sh @@ -0,0 +1,60 @@ +#! /bin/sh +command="" +infile="" +o_opt=no +pic=no +while [ $# -gt 0 ]; do + case "$1" in + --silent) + exec > /dev/null + ;; + -DPIC|-fPIC|-fpic|-Kpic|-KPIC) + if [ "$pic" != "yes" ] ; then + command="$command -DPIC" + pic=yes + fi + ;; + -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \ + -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64) + # it's a file format specifier for nasm. + command="$command $1" + ;; + -f*) + # maybe a code-generation flag for gcc. + ;; + -[Ii]*) + incdir=`echo "$1" | sed 's/^-[Ii]//'` + if [ "x$incdir" = x -a "x$2" != x ] ; then + case "$2" in + -*) ;; + *) incdir="$2"; shift;; + esac + fi + if [ "x$incdir" != x ] ; then + # In the case of NASM, the trailing slash is necessary. + incdir=`echo "$incdir" | sed 's%/*$%/%'` + command="$command -I$incdir" + fi + ;; + -o*) + o_opt=yes + command="$command $1" + ;; + *.asm) + infile=$1 + command="$command $1" + ;; + *) + command="$command $1" + ;; + esac + shift +done +if [ "$o_opt" != yes ] ; then + # By default, NASM creates an output file + # in the same directory as the input file. + outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o" + command="$command $outfile" +fi +echo $command +exec $command diff --git a/Builder/jni-1.11/simd/src/StLog.h b/Builder/jni-1.11/simd/src/StLog.h deleted file mode 100644 index af8e69717..000000000 --- a/Builder/jni-1.11/simd/src/StLog.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (C) 2013 The Common CLI viewer interface Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __LOG_H__ -#define __LOG_H__ - -#include - -#define DEBUG_L(DEBUG_ENABLED, LCTX, args...) \ - { if (DEBUG_ENABLED) {__android_log_print(ANDROID_LOG_DEBUG, LCTX, args); } } - -#define ERROR_L(LCTX, args...) \ - __android_log_print(ANDROID_LOG_ERROR, LCTX, args) - -#define WARN_L(LCTX, args...) \ - __android_log_print(ANDROID_LOG_WARNING, LCTX, args) - -#define INFO_L(LCTX, args...) \ - __android_log_print(ANDROID_LOG_INFO, LCTX, args) - -#endif diff --git a/Builder/jni-1.11/simd/src/jsimd.h b/Builder/jni-1.11/simd/src/jsimd.h deleted file mode 100644 index 3d4751ffc..000000000 --- a/Builder/jni-1.11/simd/src/jsimd.h +++ /dev/null @@ -1,670 +0,0 @@ -/* - * simd/jsimd.h - * - * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright 2011 D. R. Commander - * - * Based on the x86 SIMD extension for IJG JPEG library, - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * For conditions of distribution and use, see copyright notice in jsimdext.inc - * - */ - -/* Bitmask for supported acceleration methods */ - -#define JSIMD_NONE 0x00 -#define JSIMD_MMX 0x01 -#define JSIMD_3DNOW 0x02 -#define JSIMD_SSE 0x04 -#define JSIMD_SSE2 0x08 -#define JSIMD_ARM_NEON 0x10 - -/* Short forms of external names for systems with brain-damaged linkers. 
*/ - -#ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_simd_cpu_support jSiCpuSupport -#define jsimd_rgb_ycc_convert_mmx jSRGBYCCM -#define jsimd_extrgb_ycc_convert_mmx jSEXTRGBYCCM -#define jsimd_extrgbx_ycc_convert_mmx jSEXTRGBXYCCM -#define jsimd_extbgr_ycc_convert_mmx jSEXTBGRYCCM -#define jsimd_extbgrx_ycc_convert_mmx jSEXTBGRXYCCM -#define jsimd_extxbgr_ycc_convert_mmx jSEXTXBGRYCCM -#define jsimd_extxrgb_ycc_convert_mmx jSEXTXRGBYCCM -#define jsimd_rgb_gray_convert_mmx jSRGBGRYM -#define jsimd_extrgb_gray_convert_mmx jSEXTRGBGRYM -#define jsimd_extrgbx_gray_convert_mmx jSEXTRGBXGRYM -#define jsimd_extbgr_gray_convert_mmx jSEXTBGRGRYM -#define jsimd_extbgrx_gray_convert_mmx jSEXTBGRXGRYM -#define jsimd_extxbgr_gray_convert_mmx jSEXTXBGRGRYM -#define jsimd_extxrgb_gray_convert_mmx jSEXTXRGBGRYM -#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM -#define jsimd_ycc_extrgb_convert_mmx jSYCCEXTRGBM -#define jsimd_ycc_extrgbx_convert_mmx jSYCCEXTRGBXM -#define jsimd_ycc_extbgr_convert_mmx jSYCCEXTBGRM -#define jsimd_ycc_extbgrx_convert_mmx jSYCCEXTBGRXM -#define jsimd_ycc_extxbgr_convert_mmx jSYCCEXTXBGRM -#define jsimd_ycc_extxrgb_convert_mmx jSYCCEXTXRGBM -#define jconst_rgb_ycc_convert_sse2 jSCRGBYCCS2 -#define jsimd_rgb_ycc_convert_sse2 jSRGBYCCS2 -#define jsimd_extrgb_ycc_convert_sse2 jSEXTRGBYCCS2 -#define jsimd_extrgbx_ycc_convert_sse2 jSEXTRGBXYCCS2 -#define jsimd_extbgr_ycc_convert_sse2 jSEXTBGRYCCS2 -#define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2 -#define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2 -#define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2 -#define jconst_rgb_gray_convert_sse2 jSCRGBGRYS2 -#define jsimd_rgb_gray_convert_sse2 jSRGBGRYS2 -#define jsimd_extrgb_gray_convert_sse2 jSEXTRGBGRYS2 -#define jsimd_extrgbx_gray_convert_sse2 jSEXTRGBXGRYS2 -#define jsimd_extbgr_gray_convert_sse2 jSEXTBGRGRYS2 -#define jsimd_extbgrx_gray_convert_sse2 jSEXTBGRXGRYS2 -#define jsimd_extxbgr_gray_convert_sse2 jSEXTXBGRGRYS2 -#define 
jsimd_extxrgb_gray_convert_sse2 jSEXTXRGBGRYS2 -#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2 -#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2 -#define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2 -#define jsimd_ycc_extrgbx_convert_sse2 jSYCCEXTRGBXS2 -#define jsimd_ycc_extbgr_convert_sse2 jSYCCEXTBGRS2 -#define jsimd_ycc_extbgrx_convert_sse2 jSYCCEXTBGRXS2 -#define jsimd_ycc_extxbgr_convert_sse2 jSYCCEXTXBGRS2 -#define jsimd_ycc_extxrgb_convert_sse2 jSYCCEXTXRGBS2 -#define jsimd_h2v2_downsample_mmx jSDnH2V2M -#define jsimd_h2v1_downsample_mmx jSDnH2V1M -#define jsimd_h2v2_downsample_sse2 jSDnH2V2S2 -#define jsimd_h2v1_downsample_sse2 jSDnH2V1S2 -#define jsimd_h2v2_upsample_mmx jSUpH2V2M -#define jsimd_h2v1_upsample_mmx jSUpH2V1M -#define jsimd_h2v2_fancy_upsample_mmx jSFUpH2V2M -#define jsimd_h2v1_fancy_upsample_mmx jSFUpH2V1M -#define jsimd_h2v2_merged_upsample_mmx jSMUpH2V2M -#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM -#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM -#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM -#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM -#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM -#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM -#define jsimd_h2v1_merged_upsample_mmx jSMUpH2V1M -#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM -#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM -#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM -#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM -#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM -#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM -#define jsimd_h2v2_upsample_sse2 jSUpH2V2S2 -#define jsimd_h2v1_upsample_sse2 jSUpH2V1S2 -#define jconst_fancy_upsample_sse2 jSCFUpS2 -#define jsimd_h2v2_fancy_upsample_sse2 jSFUpH2V2S2 -#define jsimd_h2v1_fancy_upsample_sse2 jSFUpH2V1S2 -#define 
jconst_merged_upsample_sse2 jSCMUpS2 -#define jsimd_h2v2_merged_upsample_sse2 jSMUpH2V2S2 -#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2 -#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2 -#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2 -#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2 -#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2 -#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2 -#define jsimd_h2v1_merged_upsample_sse2 jSMUpH2V1S2 -#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2 -#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2 -#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2 -#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2 -#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2 -#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2 -#define jsimd_convsamp_mmx jSConvM -#define jsimd_convsamp_sse2 jSConvS2 -#define jsimd_convsamp_float_3dnow jSConvF3D -#define jsimd_convsamp_float_sse jSConvFS -#define jsimd_convsamp_float_sse2 jSConvFS2 -#define jsimd_fdct_islow_mmx jSFDMIS -#define jsimd_fdct_ifast_mmx jSFDMIF -#define jconst_fdct_islow_sse2 jSCFDS2IS -#define jsimd_fdct_islow_sse2 jSFDS2IS -#define jconst_fdct_ifast_sse2 jSCFDS2IF -#define jsimd_fdct_ifast_sse2 jSFDS2IF -#define jsimd_fdct_float_3dnow jSFD3DF -#define jconst_fdct_float_sse jSCFDSF -#define jsimd_fdct_float_sse jSFDSF -#define jsimd_quantize_mmx jSQuantM -#define jsimd_quantize_sse2 jSQuantS2 -#define jsimd_quantize_float_3dnow jSQuantF3D -#define jsimd_quantize_float_sse jSQuantFS -#define jsimd_quantize_float_sse2 jSQuantFS2 -#define jsimd_idct_2x2_mmx jSIDM22 -#define jsimd_idct_4x4_mmx jSIDM44 -#define jconst_idct_red_sse2 jSCIDS2R -#define jsimd_idct_2x2_sse2 jSIDS222 -#define jsimd_idct_4x4_sse2 jSIDS244 -#define jsimd_idct_islow_mmx jSIDMIS -#define jsimd_idct_ifast_mmx jSIDMIF -#define 
jconst_idct_islow_sse2 jSCIDS2IS -#define jsimd_idct_islow_sse2 jSIDS2IS -#define jconst_idct_ifast_sse2 jSCIDS2IF -#define jsimd_idct_ifast_sse2 jSIDS2IF -#define jsimd_idct_float_3dnow jSID3DF -#define jconst_fdct_float_sse jSCIDSF -#define jsimd_idct_float_sse jSIDSF -#define jconst_fdct_float_sse2 jSCIDS2F -#define jsimd_idct_float_sse2 jSIDS2F -#endif /* NEED_SHORT_EXTERNAL_NAMES */ - -/* SIMD Ext: retrieve SIMD/CPU information */ -EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void)); - -/* SIMD Color Space Conversion */ -EXTERN(void) jsimd_rgb_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgb_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgbx_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgr_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgrx_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxbgr_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxrgb_ycc_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); - -EXTERN(void) jsimd_rgb_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgb_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) 
jsimd_extrgbx_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgr_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgrx_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxbgr_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxrgb_gray_convert_mmx - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); - -EXTERN(void) jsimd_ycc_rgb_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extrgb_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extrgbx_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extbgr_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extbgrx_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extxbgr_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extxrgb_convert_mmx - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); - -extern const int jconst_rgb_ycc_convert_sse2[]; -EXTERN(void) 
jsimd_rgb_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgb_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgbx_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgr_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgrx_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxbgr_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); - -extern const int jconst_rgb_gray_convert_sse2[]; -EXTERN(void) jsimd_rgb_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgb_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgbx_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgr_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgrx_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) 
jsimd_extxbgr_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxrgb_gray_convert_sse2 - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); - -extern const int jconst_ycc_rgb_convert_sse2[]; -EXTERN(void) jsimd_ycc_rgb_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extrgb_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extrgbx_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extbgr_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extbgrx_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extxbgr_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extxrgb_convert_sse2 - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); - -EXTERN(void) jsimd_rgb_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgb_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extrgbx_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) 
jsimd_extbgr_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extbgrx_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxbgr_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); -EXTERN(void) jsimd_extxrgb_ycc_convert_neon - JPP((JDIMENSION img_width, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); - -EXTERN(void) jsimd_ycc_rgb_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extrgb_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extrgbx_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extbgr_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extbgrx_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extxbgr_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); -EXTERN(void) jsimd_ycc_extxrgb_convert_neon - JPP((JDIMENSION out_width, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); - -/* SIMD Downsample */ -EXTERN(void) jsimd_h2v2_downsample_mmx - JPP((JDIMENSION image_width, int max_v_samp_factor, - JDIMENSION v_samp_factor, JDIMENSION width_blocks, - JSAMPARRAY input_data, JSAMPARRAY output_data)); -EXTERN(void) 
jsimd_h2v1_downsample_mmx - JPP((JDIMENSION image_width, int max_v_samp_factor, - JDIMENSION v_samp_factor, JDIMENSION width_blocks, - JSAMPARRAY input_data, JSAMPARRAY output_data)); - -EXTERN(void) jsimd_h2v2_downsample_sse2 - JPP((JDIMENSION image_width, int max_v_samp_factor, - JDIMENSION v_samp_factor, JDIMENSION width_blocks, - JSAMPARRAY input_data, JSAMPARRAY output_data)); -EXTERN(void) jsimd_h2v1_downsample_sse2 - JPP((JDIMENSION image_width, int max_v_samp_factor, - JDIMENSION v_samp_factor, JDIMENSION width_blocks, - JSAMPARRAY input_data, JSAMPARRAY output_data)); - -/* SIMD Upsample */ -EXTERN(void) jsimd_h2v2_upsample_mmx - JPP((int max_v_samp_factor, JDIMENSION output_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); -EXTERN(void) jsimd_h2v1_upsample_mmx - JPP((int max_v_samp_factor, JDIMENSION output_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); - -EXTERN(void) jsimd_h2v2_fancy_upsample_mmx - JPP((int max_v_samp_factor, JDIMENSION downsampled_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); -EXTERN(void) jsimd_h2v1_fancy_upsample_mmx - JPP((int max_v_samp_factor, JDIMENSION downsampled_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); - -EXTERN(void) jsimd_h2v2_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION 
in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); - -EXTERN(void) jsimd_h2v2_upsample_sse2 - JPP((int max_v_samp_factor, JDIMENSION output_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); -EXTERN(void) jsimd_h2v1_upsample_sse2 - JPP((int max_v_samp_factor, JDIMENSION output_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); - -extern const int jconst_fancy_upsample_sse2[]; -EXTERN(void) jsimd_h2v2_fancy_upsample_sse2 - JPP((int max_v_samp_factor, JDIMENSION downsampled_width, - JSAMPARRAY input_data, JSAMPARRAY * 
output_data_ptr)); -EXTERN(void) jsimd_h2v1_fancy_upsample_sse2 - JPP((int max_v_samp_factor, JDIMENSION downsampled_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); - -extern const int jconst_merged_upsample_sse2[]; -EXTERN(void) jsimd_h2v2_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, 
JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); -EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2 - JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); - -EXTERN(void) jsimd_h2v1_fancy_upsample_neon - JPP((int max_v_samp_factor, JDIMENSION downsampled_width, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); - -/* SIMD Sample Conversion */ -EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data, - JDIMENSION start_col, - DCTELEM * workspace)); - -EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data, - JDIMENSION start_col, - DCTELEM * workspace)); - -EXTERN(void) jsimd_convsamp_neon JPP((JSAMPARRAY sample_data, - JDIMENSION start_col, - DCTELEM * workspace)); - -EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data, - JDIMENSION start_col, - FAST_FLOAT * workspace)); - -EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data, - JDIMENSION start_col, - FAST_FLOAT * workspace)); - -EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data, - JDIMENSION start_col, - FAST_FLOAT * workspace)); - -/* SIMD Forward DCT */ -EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data)); -EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data)); - -extern const int jconst_fdct_ifast_sse2[]; -EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data)); -extern const int jconst_fdct_islow_sse2[]; -EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data)); - -EXTERN(void) jsimd_fdct_ifast_neon JPP((DCTELEM * data)); - -EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data)); - -extern const int jconst_fdct_float_sse[]; -EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * 
data)); - -/* SIMD Quantization */ -EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block, - DCTELEM * divisors, - DCTELEM * workspace)); - -EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block, - DCTELEM * divisors, - DCTELEM * workspace)); - -EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block, - DCTELEM * divisors, - DCTELEM * workspace)); - -EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block, - FAST_FLOAT * divisors, - FAST_FLOAT * workspace)); - -EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block, - FAST_FLOAT * divisors, - FAST_FLOAT * workspace)); - -EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block, - FAST_FLOAT * divisors, - FAST_FLOAT * workspace)); - -/* SIMD Reduced Inverse DCT */ -EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); -EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -extern const int jconst_idct_red_sse2[]; -EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); -EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); -EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -/* SIMD Inverse DCT */ -EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); -EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -extern const int jconst_idct_islow_sse2[]; -EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table, - JCOEFPTR 
coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); -extern const int jconst_idct_ifast_sse2[]; -EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -EXTERN(void) jsimd_idct_islow_neon JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); -EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -extern const int jconst_idct_float_sse[]; -EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - -extern const int jconst_idct_float_sse2[]; -EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col)); - diff --git a/Builder/jni-1.11/simd/src/jsimd_none.c b/Builder/jni-1.11/simd/src/jsimd_none.c deleted file mode 100644 index 523e5dda5..000000000 --- a/Builder/jni-1.11/simd/src/jsimd_none.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * jsimd_none.c - * - * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright 2009-2011 D. R. Commander - * - * Based on the x86 SIMD extension for IJG JPEG library, - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * For conditions of distribution and use, see copyright notice in jsimdext.inc - * - * This file contains stubs for when there is no SIMD support available. 
- */ - -#define JPEG_INTERNALS -#include "jinclude.h" -#include "jpeglib.h" -#include "jdct.h" -#include "jsimddct.h" -#include "jsimd.h" - -GLOBAL(int) -jsimd_can_rgb_ycc (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_rgb_gray (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_rgb_ycc_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) -{ -} - -GLOBAL(void) -jsimd_rgb_gray_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) -{ -} - -GLOBAL(void) -jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_downsample (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_downsample (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ -} - -GLOBAL(void) -jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_upsample (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_upsample (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, - JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) -{ -} - -GLOBAL(void) -jsimd_h2v1_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, - JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_fancy_upsample (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_fancy_upsample (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, - JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) -{ -} - -GLOBAL(void) 
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, - jpeg_component_info * compptr, - JSAMPARRAY input_data, - JSAMPARRAY * output_data_ptr) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_merged_upsample (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_merged_upsample (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf) -{ -} - -GLOBAL(void) -jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf) -{ -} - -GLOBAL(int) -jsimd_can_convsamp (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_convsamp_float (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM * workspace) -{ -} - -GLOBAL(void) -jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, - FAST_FLOAT * workspace) -{ -} - -GLOBAL(int) -jsimd_can_fdct_islow (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_ifast (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_float (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_fdct_islow (DCTELEM * data) -{ -} - -GLOBAL(void) -jsimd_fdct_ifast (DCTELEM * data) -{ -} - -GLOBAL(void) -jsimd_fdct_float (FAST_FLOAT * data) -{ -} - -GLOBAL(int) -jsimd_can_quantize (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_quantize_float (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, - DCTELEM * workspace) -{ -} - -GLOBAL(void) -jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, - FAST_FLOAT * workspace) -{ -} - -GLOBAL(int) -jsimd_can_idct_2x2 (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_4x4 (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_4x4 (j_decompress_ptr cinfo, 
jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(int) -jsimd_can_idct_islow (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_ifast (void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_float (void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} -