mirror of
https://git.freebsd.org/src.git
synced 2026-01-11 19:57:22 +00:00
Update the Arm Optimized Routine library to v25.01
Sponsored by: Arm Ltd
This commit is contained in:
commit
f3087bef11
472 changed files with 11930 additions and 14603 deletions
|
|
@ -1,12 +1,9 @@
|
|||
/
|
||||
Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
Tamar Christina <tamar.christina@arm.com>
|
||||
math/
|
||||
Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
networking/
|
||||
Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
pl/
|
||||
Pierre Blanchard <pierre.blanchard@arm.com>
|
||||
Joe Ramsay <joe.ramsay@arm.com>
|
||||
networking/
|
||||
Ola Liljedahl <ola.liljedahl@arm.com>
|
||||
string/
|
||||
Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# Makefile - requires GNU make
|
||||
#
|
||||
# Copyright (c) 2018-2022, Arm Limited.
|
||||
# Copyright (c) 2018-2024, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
srcdir = .
|
||||
|
|
@ -11,7 +11,6 @@ includedir = $(prefix)/include
|
|||
|
||||
# Configure these in config.mk, do not make changes in this file.
|
||||
SUBS = math string networking
|
||||
PLSUBS = math
|
||||
HOST_CC = cc
|
||||
HOST_CFLAGS = -std=c99 -O2
|
||||
HOST_LDFLAGS =
|
||||
|
|
@ -21,12 +20,22 @@ CPPFLAGS =
|
|||
CFLAGS = -std=c99 -O2
|
||||
CFLAGS_SHARED = -fPIC
|
||||
CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS)
|
||||
CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL
|
||||
LDFLAGS =
|
||||
LDLIBS =
|
||||
AR = $(CROSS_COMPILE)ar
|
||||
RANLIB = $(CROSS_COMPILE)ranlib
|
||||
INSTALL = install
|
||||
# Detect OS.
|
||||
# Assume Unix environment: Linux, Darwin, or Msys.
|
||||
OS := $(shell uname -s)
|
||||
OS := $(patsubst MSYS%,Msys,$(OS))
|
||||
# Following math dependencies can be adjusted in config file
|
||||
# if necessary, e.g. for Msys.
|
||||
libm-libs = -lm
|
||||
libc-libs = -lc
|
||||
mpfr-libs = -lmpfr
|
||||
gmp-libs = -lgmp
|
||||
mpc-libs = -lmpc
|
||||
|
||||
all:
|
||||
|
||||
|
|
@ -53,7 +62,6 @@ $(DIRS):
|
|||
mkdir -p $@
|
||||
|
||||
$(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED)
|
||||
$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED)
|
||||
|
||||
build/%.o: $(srcdir)/%.S
|
||||
$(CC) $(CFLAGS_ALL) -c -o $@ $<
|
||||
|
|
|
|||
|
|
@ -12,12 +12,25 @@ contribution requirements are documented in README.contributors of
|
|||
the appropriate subdirectory.
|
||||
|
||||
Regular quarterly releases are tagged as vYY.MM, the latest
|
||||
release is v24.01.
|
||||
release is v25.01.
|
||||
|
||||
Source code layout:
|
||||
|
||||
build/ - build directory (created by make).
|
||||
math/ - math subproject sources.
|
||||
math/ - math subproject sources for generic scalar
|
||||
subroutines and sources shared with
|
||||
subdirectories of math/.
|
||||
All math routines should meet the quality
|
||||
requirements stated in math/README.contributors,
|
||||
routines that fail to do so are located in an
|
||||
experimental/ directory.
|
||||
math/aarch64/ - math subproject AArch64-specific sources
|
||||
and sources shared with subdirectories.
|
||||
math/aarch64/advsimd - AdvSIMD-specific math sources.
|
||||
math/aarch64/experimental - Experimental math sources do not
|
||||
meet quality requirements stated in
|
||||
math/README.contributors.
|
||||
math/aarch64/sve - SVE-specific math sources.
|
||||
math/include/ - math library public headers.
|
||||
math/test/ - math test and benchmark related sources.
|
||||
math/tools/ - tools used for designing the algorithms.
|
||||
|
|
@ -25,9 +38,16 @@ networking/ - networking subproject sources.
|
|||
networking/include/ - networking library public headers.
|
||||
networking/test/ - networking test and benchmark related sources.
|
||||
string/ - string routines subproject sources.
|
||||
All string routines should meet the quality
|
||||
requirements stated in string/README.contributors,
|
||||
routines that fail to do so are located in an
|
||||
experimental/ directory.
|
||||
string/<arch> - <arch>-specific string routines sources for
|
||||
<arch>=aarch64 and arm.
|
||||
string/aarch64/experimental - Experimental string routines which
|
||||
may not be fully optimized yet.
|
||||
string/include/ - string library public headers.
|
||||
string/test/ - string test and benchmark related sources.
|
||||
pl/... - separately maintained performance library code.
|
||||
|
||||
The steps to build the target libraries and run the tests:
|
||||
|
||||
|
|
@ -50,6 +70,13 @@ Or building and testing the math subproject only:
|
|||
make all-math
|
||||
make check-math
|
||||
|
||||
Note on compiler compatibility/requirements:
|
||||
|
||||
SVE routines are always built by default - this means that on AArch64
|
||||
GCC >= 10 or LLVM >= 5 is always required for SVE ACLE compatibility.
|
||||
There is no explicit check for compatible compiler, therefore the SVE
|
||||
routines will fail to build if CC is too old.
|
||||
|
||||
The test system requires libmpfr and libmpc.
|
||||
For example on debian linux they can be installed as:
|
||||
|
||||
|
|
|
|||
|
|
@ -1,14 +1,11 @@
|
|||
# Example config.mk
|
||||
#
|
||||
# Copyright (c) 2018-2023, Arm Limited.
|
||||
# Copyright (c) 2018-2024, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
# Subprojects to build
|
||||
SUBS = math string networking
|
||||
|
||||
# Subsubprojects to build if subproject pl is built
|
||||
PLSUBS = math
|
||||
|
||||
# Target architecture: aarch64, arm or x86_64
|
||||
ARCH = aarch64
|
||||
|
||||
|
|
@ -30,6 +27,27 @@ HOST_CFLAGS += -Wall -Wno-unused-function
|
|||
HOST_CFLAGS += -g
|
||||
CFLAGS += -g
|
||||
|
||||
ifeq ($(OS),Msys)
|
||||
# llvm is the only available/valid native compiler
|
||||
CC = clang
|
||||
AR = llvm-ar
|
||||
RANLIB = llvm-ranlib
|
||||
HOST_CC = clang
|
||||
SYSROOT = /c/wenv/msys2/msys64/clangarm64
|
||||
# Common windows flags
|
||||
COMMON_WIN_CFLAGS = -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE
|
||||
COMMON_WIN_CFLAGS += -Wno-deprecated-declarations -Wno-unused-variable
|
||||
# For mathtest
|
||||
HOST_CFLAGS += -I$(SYSROOT)/include
|
||||
HOST_CFLAGS += $(COMMON_WIN_CFLAGS) -Wno-ignored-attributes
|
||||
# Clear the default flag -fPIC, as not supported on Windows
|
||||
CFLAGS_SHARED =
|
||||
# For ulp.h with MPFR
|
||||
CFLAGS += -I$(SYSROOT)/include
|
||||
# For clang on Windows
|
||||
CFLAGS += $(COMMON_WIN_CFLAGS)
|
||||
endif
|
||||
|
||||
# Optimize the shared libraries on aarch64 assuming they fit in 1M.
|
||||
#CFLAGS_SHARED = -fPIC -mcmodel=tiny
|
||||
|
||||
|
|
@ -45,12 +63,33 @@ math-cflags =
|
|||
math-ldlibs =
|
||||
math-ulpflags =
|
||||
math-testflags =
|
||||
string-cflags =
|
||||
string-cflags = -falign-functions=64
|
||||
networking-cflags =
|
||||
|
||||
# Use if mpfr is available on the target for ulp error checking.
|
||||
#math-ldlibs += -lmpfr -lgmp
|
||||
#math-cflags += -DUSE_MPFR
|
||||
ifeq ($(OS),Msys)
|
||||
# Libraries can be installed with pacman
|
||||
libm-libs = -lmsvcrt -lvcruntime -lucrt
|
||||
libc-libs =
|
||||
# Linker will look for .lib but some systems only have .dll.a,
|
||||
# therefore we have to give absolute path to libraries.
|
||||
# This is system dependent and might need adjusting.
|
||||
mpfr-libs = $(SYSROOT)/lib/libmpfr.dll.a
|
||||
gmp-libs = $(SYSROOT)/lib/libgmp.dll.a
|
||||
mpc-libs = $(SYSROOT)/lib/libmpc.dll.a
|
||||
endif
|
||||
|
||||
# Use if mpfr is available on the target for ulp error checking. If
|
||||
# enabling this, it is advised to disable fenv checks by uncommenting
|
||||
# the two lines at the bottom of this block.
|
||||
USE_MPFR=0
|
||||
math-cflags += -DUSE_MPFR=$(USE_MPFR)
|
||||
ifeq ($(USE_MPFR), 1)
|
||||
math-ldlibs += $(mpfr-libs) $(gmp-libs)
|
||||
math-ulpflags += -m
|
||||
endif
|
||||
# Disable fenv checks
|
||||
#math-ulpflags = -q -f
|
||||
#math-testflags = -nostatus
|
||||
|
||||
# Use with gcc.
|
||||
math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector
|
||||
|
|
@ -59,30 +98,36 @@ math-cflags += -ffp-contract=fast -fno-math-errno
|
|||
# Use with clang.
|
||||
#math-cflags += -ffp-contract=fast
|
||||
|
||||
# Disable/enable SVE vector math code and tests.
|
||||
# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
|
||||
# routines only so that SVE code does not leak into scalar
|
||||
# routines. It is also necessary to add it for tools (e.g. ulp,
|
||||
# mathbench)
|
||||
WANT_SVE_MATH = 0
|
||||
ifeq ($(WANT_SVE_MATH), 1)
|
||||
math-sve-cflags = -march=armv8-a+sve
|
||||
endif
|
||||
math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
|
||||
|
||||
# If defined to 1, set errno in math functions according to ISO C. Many math
|
||||
# libraries do not set errno, so this is 0 by default. It may need to be
|
||||
# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
|
||||
WANT_ERRNO = 0
|
||||
math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)
|
||||
|
||||
# Disable/enable SVE vector math tests/tools.
|
||||
ifeq ($(ARCH),aarch64)
|
||||
WANT_SVE_TESTS = 1
|
||||
else
|
||||
WANT_SVE_TESTS = 0
|
||||
endif
|
||||
math-cflags += -DWANT_SVE_TESTS=$(WANT_SVE_TESTS)
|
||||
|
||||
# If set to 1, set fenv in vector math routines.
|
||||
WANT_SIMD_EXCEPT = 0
|
||||
math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
|
||||
|
||||
# Disable fenv checks
|
||||
#math-ulpflags = -q -f
|
||||
#math-testflags = -nostatus
|
||||
# If set to 1, enable tests for exp10.
|
||||
WANT_EXP10_TESTS = 1
|
||||
math-cflags += -DWANT_EXP10_TESTS=$(WANT_EXP10_TESTS)
|
||||
|
||||
# If set to 1, enable tests for sinpi and cospi. These functions are
|
||||
# only supported on aarch64
|
||||
ifeq ($(ARCH),aarch64)
|
||||
WANT_TRIGPI_TESTS = 1
|
||||
else
|
||||
WANT_TRIGPI_TESTS = 0
|
||||
endif
|
||||
math-cflags += -DWANT_TRIGPI_TESTS=$(WANT_TRIGPI_TESTS)
|
||||
|
||||
# Remove GNU Property Notes from asm files.
|
||||
#string-cflags += -DWANT_GNU_PROPERTY=0
|
||||
|
|
@ -92,3 +137,13 @@ math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
|
|||
|
||||
# Avoid auto-vectorization of scalar code and unroll loops
|
||||
networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
|
||||
|
||||
# Provide *_finite symbols and some of the glibc hidden symbols
|
||||
# so libmathlib can be used with binaries compiled against glibc
|
||||
# to interpose math functions with both static and dynamic linking
|
||||
USE_GLIBC_ABI = 1
|
||||
math-cflags += -DUSE_GLIBC_ABI=$(USE_GLIBC_ABI)
|
||||
|
||||
# Enable experimental math routines - non-C23 vector math and low-accuracy scalar routines
|
||||
WANT_EXPERIMENTAL_MATH = 0
|
||||
math-cflags += -DWANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH)
|
||||
|
|
|
|||
|
|
@ -1,23 +1,61 @@
|
|||
# Makefile fragment - requires GNU make
|
||||
#
|
||||
# Copyright (c) 2019-2023, Arm Limited.
|
||||
# Copyright (c) 2019-2024, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
S := $(srcdir)/math
|
||||
B := build/math
|
||||
.SECONDEXPANSION:
|
||||
|
||||
math-lib-srcs := $(wildcard $(S)/*.[cS])
|
||||
math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
|
||||
ifneq ($(OS),Linux)
|
||||
ifeq ($(WANT_SIMD_EXCEPT),1)
|
||||
$(error WANT_SIMD_EXCEPT is not supported outside Linux)
|
||||
endif
|
||||
ifneq ($(USE_MPFR),1)
|
||||
$(warning WARNING: Double-precision ULP tests will not be usable without MPFR)
|
||||
endif
|
||||
ifeq ($(USE_GLIBC_ABI),1)
|
||||
$(error Can only generate special GLIBC symbols on Linux - please disable USE_GLIBC_ABI)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(ARCH),aarch64)
|
||||
ifeq ($(WANT_TRIGPI_TESTS),1)
|
||||
$(error trigpi functions only supported on aarch64)
|
||||
endif
|
||||
ifeq ($(WANT_EXPERIMENTAL_MATH),1)
|
||||
$(error Experimental math only supported on aarch64)
|
||||
endif
|
||||
endif
|
||||
|
||||
math-src-dir := $(srcdir)/math
|
||||
math-build-dir := build/math
|
||||
|
||||
math-lib-srcs := $(wildcard $(math-src-dir)/*.[cS])
|
||||
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*.[cS])
|
||||
ifeq ($(OS),Linux)
|
||||
# Vector symbols only supported on Linux
|
||||
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*/*.[cS])
|
||||
endif
|
||||
|
||||
ifeq ($(WANT_EXPERIMENTAL_MATH), 1)
|
||||
ifeq ($(OS),Linux)
|
||||
# Vector symbols only supported on Linux
|
||||
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*/*.[cS])
|
||||
else
|
||||
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*.[cS])
|
||||
endif
|
||||
else
|
||||
# Scalar experimental symbols will have been added by wildcard, so remove them
|
||||
math-lib-srcs := $(filter-out $(math-src-dir)/aarch64/experimental/%, $(math-lib-srcs))
|
||||
endif
|
||||
|
||||
math-test-srcs := \
|
||||
$(S)/test/mathtest.c \
|
||||
$(S)/test/mathbench.c \
|
||||
$(S)/test/ulp.c \
|
||||
$(math-src-dir)/test/mathtest.c \
|
||||
$(math-src-dir)/test/mathbench.c \
|
||||
$(math-src-dir)/test/ulp.c \
|
||||
|
||||
math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
|
||||
math-test-host-srcs := $(wildcard $(math-src-dir)/test/rtest/*.[cS])
|
||||
|
||||
math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
|
||||
math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
|
||||
math-includes := $(patsubst $(math-src-dir)/%,build/%,$(wildcard $(math-src-dir)/include/*.h))
|
||||
|
||||
math-libs := \
|
||||
build/lib/libmathlib.so \
|
||||
|
|
@ -33,9 +71,9 @@ math-tools := \
|
|||
math-host-tools := \
|
||||
build/bin/rtest \
|
||||
|
||||
math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
|
||||
math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs)))
|
||||
math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
|
||||
math-lib-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-lib-srcs)))
|
||||
math-test-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-srcs)))
|
||||
math-host-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-host-srcs)))
|
||||
math-target-objs := $(math-lib-objs) $(math-test-objs)
|
||||
math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
|
||||
|
||||
|
|
@ -44,18 +82,69 @@ math-files := \
|
|||
$(math-libs) \
|
||||
$(math-tools) \
|
||||
$(math-host-tools) \
|
||||
$(math-includes) \
|
||||
$(math-test-includes) \
|
||||
$(math-includes)
|
||||
|
||||
all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
|
||||
all-math: $(math-libs) $(math-tools) $(math-includes)
|
||||
|
||||
$(math-objs): $(math-includes) $(math-test-includes)
|
||||
$(math-objs): $(math-includes)
|
||||
$(math-objs): CFLAGS_ALL += $(math-cflags)
|
||||
$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
|
||||
$(math-build-dir)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
|
||||
$(math-host-objs): CC = $(HOST_CC)
|
||||
$(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS)
|
||||
|
||||
$(B)/test/ulp.o: $(S)/test/ulp.h
|
||||
# Add include path for experimental routines so they can share helpers with non-experimental routines
|
||||
$(math-build-dir)/aarch64/experimental/advsimd/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/advsimd
|
||||
$(math-build-dir)/aarch64/experimental/sve/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/sve
|
||||
|
||||
$(math-objs): CFLAGS_ALL += -I$(math-src-dir)
|
||||
|
||||
ulp-funcs-dir = build/test/ulp-funcs/
|
||||
ulp-wrappers-dir = build/test/ulp-wrappers/
|
||||
mathbench-funcs-dir = build/test/mathbench-funcs/
|
||||
test-sig-dirs = $(ulp-funcs-dir) $(ulp-wrappers-dir) $(mathbench-funcs-dir)
|
||||
build/include/test $(test-sig-dirs) $(addsuffix /$(ARCH),$(test-sig-dirs)) $(addsuffix /aarch64/experimental,$(test-sig-dirs)) \
|
||||
$(addsuffix /aarch64/experimental/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/experimental/sve,$(test-sig-dirs)) \
|
||||
$(addsuffix /aarch64/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/sve,$(test-sig-dirs)):
|
||||
mkdir -p $@
|
||||
|
||||
ulp-funcs = $(patsubst $(math-src-dir)/%,$(ulp-funcs-dir)/%,$(basename $(math-lib-srcs)))
|
||||
ulp-wrappers = $(patsubst $(math-src-dir)/%,$(ulp-wrappers-dir)/%,$(basename $(math-lib-srcs)))
|
||||
mathbench-funcs = $(patsubst $(math-src-dir)/%,$(mathbench-funcs-dir)/%,$(basename $(math-lib-srcs)))
|
||||
|
||||
ifeq ($(WANT_SVE_TESTS), 0)
|
||||
# Filter out anything with sve in the path
|
||||
ulp-funcs := $(foreach a,$(ulp-funcs),$(if $(findstring sve,$a),,$a))
|
||||
ulp-wrappers := $(foreach a,$(ulp-wrappers),$(if $(findstring sve,$a),,$a))
|
||||
mathbench-funcs := $(foreach a,$(mathbench-funcs),$(if $(findstring sve,$a),,$a))
|
||||
endif
|
||||
|
||||
define emit_sig
|
||||
$1/aarch64/experimental/sve/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/sve
|
||||
$1/aarch64/experimental/advsimd/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/advsimd
|
||||
$1/%.i: $(math-src-dir)/%.c | $$$$(@D)
|
||||
$(CC) $$< $(math-cflags) -I$(math-src-dir)/include -I$(math-src-dir) $$(EXTRA_INC) -D$2 -E -o $$@
|
||||
$1/%: $1/%.i
|
||||
{ grep TEST_SIG $$< || true; } | cut -f 2- -d ' ' > $$@
|
||||
endef
|
||||
|
||||
$(eval $(call emit_sig,$(ulp-funcs-dir),EMIT_ULP_FUNCS))
|
||||
$(eval $(call emit_sig,$(ulp-wrappers-dir),EMIT_ULP_WRAPPERS))
|
||||
$(eval $(call emit_sig,$(mathbench-funcs-dir),EMIT_MATHBENCH_FUNCS))
|
||||
|
||||
ulp-funcs-gen = build/include/test/ulp_funcs_gen.h
|
||||
ulp-wrappers-gen = build/include/test/ulp_wrappers_gen.h
|
||||
mathbench-funcs-gen = build/include/test/mathbench_funcs_gen.h
|
||||
math-tools-autogen-headers = $(ulp-funcs-gen) $(ulp-wrappers-gen) $(mathbench-funcs-gen)
|
||||
|
||||
$(ulp-funcs-gen): $(ulp-funcs) | $$(@D)
|
||||
$(ulp-wrappers-gen): $(ulp-wrappers) | $$(@D)
|
||||
$(mathbench-funcs-gen): $(mathbench-funcs) | $$(@D)
|
||||
|
||||
$(math-tools-autogen-headers): | $$(@D)
|
||||
cat $^ | sort -u > $@
|
||||
|
||||
$(math-build-dir)/test/mathbench.o: $(mathbench-funcs-gen)
|
||||
$(math-build-dir)/test/ulp.o: $(math-src-dir)/test/ulp.h $(ulp-funcs-gen) $(ulp-wrappers-gen)
|
||||
|
||||
build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
|
||||
|
|
@ -65,38 +154,40 @@ build/lib/libmathlib.a: $(math-lib-objs)
|
|||
$(AR) rc $@ $^
|
||||
$(RANLIB) $@
|
||||
|
||||
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
|
||||
$(math-tools): LDLIBS += $(math-ldlibs) -lm
|
||||
# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
|
||||
$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
|
||||
$(math-host-tools): HOST_LDLIBS += $(libm-libs) $(mpfr-libs) $(mpc-libs)
|
||||
$(math-tools): LDLIBS += $(math-ldlibs) $(libm-libs)
|
||||
|
||||
ifneq ($(OS),Darwin)
|
||||
$(math-tools): LDFLAGS += -static
|
||||
endif
|
||||
|
||||
build/bin/rtest: $(math-host-objs)
|
||||
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
|
||||
|
||||
build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
|
||||
build/bin/mathtest: $(math-build-dir)/test/mathtest.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)
|
||||
|
||||
build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
|
||||
build/bin/mathbench: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)
|
||||
|
||||
# This is not ideal, but allows custom symbols in mathbench to get resolved.
|
||||
build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm
|
||||
build/bin/mathbench_libc: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $< $(libm-libs) $(libc-libs) build/lib/libmathlib.a $(libm-libs)
|
||||
|
||||
build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
|
||||
build/bin/ulp: $(math-build-dir)/test/ulp.o build/lib/libmathlib.a
|
||||
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(LDLIBS)
|
||||
|
||||
build/include/%.h: $(S)/include/%.h
|
||||
build/include/%.h: $(math-src-dir)/include/%.h
|
||||
cp $< $@
|
||||
|
||||
build/include/test/%.h: $(S)/test/%.h
|
||||
build/bin/%.sh: $(math-src-dir)/test/%.sh
|
||||
cp $< $@
|
||||
|
||||
build/bin/%.sh: $(S)/test/%.sh
|
||||
cp $< $@
|
||||
|
||||
math-tests := $(wildcard $(S)/test/testcases/directed/*.tst)
|
||||
math-rtests := $(wildcard $(S)/test/testcases/random/*.tst)
|
||||
math-tests := $(wildcard $(math-src-dir)/test/testcases/directed/*.tst)
|
||||
ifneq ($(WANT_EXP10_TESTS),1)
|
||||
math-tests := $(filter-out %exp10.tst, $(math-tests))
|
||||
endif
|
||||
math-rtests := $(wildcard $(math-src-dir)/test/testcases/random/*.tst)
|
||||
|
||||
check-math-test: $(math-tools)
|
||||
cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags)
|
||||
|
|
@ -104,8 +195,88 @@ check-math-test: $(math-tools)
|
|||
check-math-rtest: $(math-host-tools) $(math-tools)
|
||||
cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
|
||||
|
||||
ulp-input-dir = $(math-build-dir)/test/inputs
|
||||
$(ulp-input-dir) $(ulp-input-dir)/$(ARCH) $(ulp-input-dir)/aarch64/sve $(ulp-input-dir)/aarch64/advsimd \
|
||||
$(ulp-input-dir)/aarch64/experimental $(ulp-input-dir)/aarch64/experimental/advsimd $(ulp-input-dir)/aarch64/experimental/sve:
|
||||
mkdir -p $@
|
||||
|
||||
math-lib-lims = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp,$(math-lib-srcs))
|
||||
math-lib-lims-nn = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp_nn,$(math-lib-srcs))
|
||||
math-lib-fenvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.fenv,$(math-lib-srcs))
|
||||
math-lib-itvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.itv,$(math-lib-srcs))
|
||||
math-lib-cvals = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.cval,$(math-lib-srcs))
|
||||
|
||||
ulp-inputs = $(math-lib-lims) $(math-lib-lims-nn) $(math-lib-fenvs) $(math-lib-itvs) $(math-lib-cvals)
|
||||
$(ulp-inputs): CFLAGS = -I$(math-src-dir)/test -I$(math-src-dir)/include -I$(math-src-dir) $(math-cflags)\
|
||||
-I$(math-src-dir)/aarch64/advsimd -I$(math-src-dir)/aarch64/sve
|
||||
|
||||
$(ulp-input-dir)/%.ulp.i: $(math-src-dir)/%.c | $$(@D)
|
||||
$(CC) $(CFLAGS) $< -E -o $@
|
||||
|
||||
$(ulp-input-dir)/%.ulp: $(ulp-input-dir)/%.ulp.i
|
||||
{ grep "TEST_ULP " $< || true; } > $@
|
||||
|
||||
$(ulp-input-dir)/%.ulp_nn.i: $(math-src-dir)/%.c | $$(@D)
|
||||
$(CC) $(CFLAGS) $< -E -o $@
|
||||
|
||||
$(ulp-input-dir)/%.ulp_nn: $(ulp-input-dir)/%.ulp_nn.i
|
||||
{ grep "TEST_ULP_NONNEAREST " $< || true; } > $@
|
||||
|
||||
$(ulp-input-dir)/%.fenv.i: $(math-src-dir)/%.c | $$(@D)
|
||||
$(CC) $(CFLAGS) $< -E -o $@
|
||||
|
||||
$(ulp-input-dir)/%.fenv: $(ulp-input-dir)/%.fenv.i
|
||||
{ grep "TEST_DISABLE_FENV " $< || true; } > $@
|
||||
|
||||
$(ulp-input-dir)/%.itv.i: $(math-src-dir)/%.c | $$(@D)
|
||||
$(CC) $(CFLAGS) $< -E -o $@
|
||||
|
||||
$(ulp-input-dir)/%.itv: $(ulp-input-dir)/%.itv.i
|
||||
{ grep "TEST_INTERVAL " $< || true; } | sed "s/ TEST_INTERVAL/\nTEST_INTERVAL/g" > $@
|
||||
|
||||
$(ulp-input-dir)/%.cval.i: $(math-src-dir)/%.c | $$(@D)
|
||||
$(CC) $(CFLAGS) $< -E -o $@
|
||||
|
||||
$(ulp-input-dir)/%.cval: $(ulp-input-dir)/%.cval.i
|
||||
{ grep "TEST_CONTROL_VALUE " $< || true; } > $@
|
||||
|
||||
ulp-lims = $(ulp-input-dir)/limits
|
||||
$(ulp-lims): $(math-lib-lims)
|
||||
|
||||
ulp-lims-nn = $(ulp-input-dir)/limits_nn
|
||||
$(ulp-lims-nn): $(math-lib-lims-nn)
|
||||
|
||||
fenv-exps := $(ulp-input-dir)/fenv
|
||||
$(fenv-exps): $(math-lib-fenvs)
|
||||
|
||||
generic-itvs = $(ulp-input-dir)/itvs
|
||||
$(generic-itvs): $(filter-out $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
|
||||
|
||||
arch-itvs = $(ulp-input-dir)/$(ARCH)/itvs
|
||||
$(arch-itvs): $(filter $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
|
||||
|
||||
ulp-cvals := $(ulp-input-dir)/cvals
|
||||
$(ulp-cvals): $(math-lib-cvals)
|
||||
|
||||
# Remove the first word, which will be the TEST_* directive
|
||||
$(ulp-lims) $(ulp-lims-nn) $(fenv-exps) $(arch-itvs) $(generic-itvs) $(ulp-cvals): | $$(@D)
|
||||
sed "s/TEST_[^ ]* //g" $^ | sort -u > $@
|
||||
|
||||
check-math-ulp: $(ulp-lims) $(ulp-lims-nn)
|
||||
check-math-ulp: $(fenv-exps) $(ulp-cvals)
|
||||
check-math-ulp: $(generic-itvs) $(arch-itvs)
|
||||
check-math-ulp: $(math-tools)
|
||||
ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
|
||||
ULPFLAGS="$(math-ulpflags)" \
|
||||
LIMITS=../../$(ulp-lims) \
|
||||
ARCH_ITVS=../../$(arch-itvs) \
|
||||
GEN_ITVS=../../$(generic-itvs) \
|
||||
DISABLE_FENV=../../$(fenv-exps) \
|
||||
CVALS=../../$(ulp-cvals) \
|
||||
FUNC=$(func) \
|
||||
WANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH) \
|
||||
WANT_SVE_TESTS=$(WANT_SVE_TESTS) \
|
||||
USE_MPFR=$(USE_MPFR) \
|
||||
build/bin/runulp.sh $(EMULATOR)
|
||||
|
||||
check-math: check-math-test check-math-rtest check-math-ulp
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
STYLE REQUIREMENTS
|
||||
==================
|
||||
|
||||
1. Most code in this sub-directory is expected to be upstreamed into glibc so
|
||||
the GNU Coding Standard and glibc specific conventions should be followed
|
||||
1. With the exception of math/aarch64/experimental/, most code in this
|
||||
sub-directory is expected to be upstreamed into glibc so the GNU
|
||||
Coding Standard and glibc specific conventions should be followed
|
||||
to ease upstreaming.
|
||||
|
||||
2. ABI and symbols: the code should be written so it is suitable for inclusion
|
||||
|
|
|
|||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Double-precision vector acos(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -30,8 +30,8 @@ static const struct data
|
|||
};
|
||||
|
||||
#define AllMask v_u64 (0xffffffffffffffff)
|
||||
#define Oneu (0x3ff0000000000000)
|
||||
#define Small (0x3e50000000000000) /* 2^-53. */
|
||||
#define Oneu 0x3ff0000000000000
|
||||
#define Small 0x3e50000000000000 /* 2^-53. */
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
|
|
@ -111,12 +111,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
|
|||
return vfmaq_f64 (add, mul, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, acos, -1.0, 1.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (acos), 1.02)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
|
||||
TEST_SIG (V, D, 1, acos, -1.0, 1.0)
|
||||
TEST_ULP (V_NAME_D1 (acos), 1.02)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Single-precision vector acos(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -57,8 +57,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|||
|
||||
The largest observed error in this region is 1.32 ulps,
|
||||
_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
|
||||
want 0x1.feb32ep-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
|
||||
want 0x1.feb32ep-1. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -102,12 +102,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
|
|||
return vfmaq_f32 (add, mul, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, acos, -1.0, 1.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (acos), 0.82)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
|
||||
HALF_WIDTH_ALIAS_F1 (acos)
|
||||
|
||||
TEST_SIG (V, F, 1, acos, -1.0, 1.0)
|
||||
TEST_ULP (V_NAME_F1 (acos), 0.82)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
|
||||
|
|
@ -1,12 +1,12 @@
|
|||
/*
|
||||
* Single-precision vector acosh(x) function.
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Double-precision vector acosh(x) function.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define WANT_V_LOG1P_K0_SHORTCUT 1
|
||||
#include "v_log1p_inline.h"
|
||||
|
|
@ -45,9 +45,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
|
|||
x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
|
||||
#endif
|
||||
|
||||
float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
|
||||
float64x2_t y;
|
||||
y = vaddq_f64 (x, v_f64 (1));
|
||||
float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
|
||||
float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
|
||||
y = vmulq_f64 (y, xm1);
|
||||
y = vsqrtq_f64 (y);
|
||||
y = vaddq_f64 (xm1, y);
|
||||
|
|
@ -57,10 +56,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
|
|||
return log1p_inline (y, &d->log1p_consts);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, acosh, 1.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (acosh), 2.53)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
|
||||
TEST_SIG (V, D, 1, acosh, 1.0, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (acosh), 2.53)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
|
||||
TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
|
||||
|
|
@ -1,49 +1,46 @@
|
|||
/*
|
||||
* Single-precision vector acosh(x) function.
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_log1pf_inline.h"
|
||||
|
||||
#define SquareLim 0x1p64
|
||||
|
||||
const static struct data
|
||||
{
|
||||
struct v_log1pf_data log1pf_consts;
|
||||
uint32x4_t one;
|
||||
uint16x4_t thresh;
|
||||
} data = {
|
||||
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
|
||||
.one = V4 (0x3f800000),
|
||||
.thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1). */
|
||||
};
|
||||
} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
|
||||
|
||||
#define SignMask 0x80000000
|
||||
#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
|
||||
const struct v_log1pf_data d)
|
||||
const struct v_log1pf_data *d)
|
||||
{
|
||||
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
|
||||
}
|
||||
|
||||
/* Vector approximation for single-precision acosh, based on log1p. Maximum
|
||||
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
|
||||
is 2.78 ULP:
|
||||
__v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
|
||||
want 0x1.ef9ea2p-3.
|
||||
is 3.00 ULP:
|
||||
_ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
|
||||
want 0x1.ef0a7cp-4.
|
||||
With exceptions disabled, we can compute u with a shorter dependency chain,
|
||||
which gives maximum error of 3.07 ULP:
|
||||
__v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
|
||||
want 0x1.fbc7f4p-4. */
|
||||
which gives maximum error of 3.22 ULP:
|
||||
_ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
|
||||
want 0x1.fdcdd2p-5. */
|
||||
|
||||
VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
|
||||
uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
|
||||
|
|
@ -54,25 +51,28 @@ VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
|
|||
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
|
||||
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
|
||||
#else
|
||||
float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
|
||||
float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
|
||||
float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
|
||||
float32x4_t u
|
||||
= vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
|
||||
#endif
|
||||
|
||||
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
|
||||
|
||||
if (unlikely (v_any_u16h (special)))
|
||||
return special_case (x, y, special, d->log1pf_consts);
|
||||
return log1pf_inline (y, d->log1pf_consts);
|
||||
return special_case (x, y, special, &d->log1pf_consts);
|
||||
return log1pf_inline (y, &d->log1pf_consts);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, acosh, 1.0, 10.0)
|
||||
HALF_WIDTH_ALIAS_F1 (acosh)
|
||||
|
||||
TEST_SIG (V, F, 1, acosh, 1.0, 10.0)
|
||||
#if WANT_SIMD_EXCEPT
|
||||
PL_TEST_ULP (V_NAME_F1 (acosh), 2.29)
|
||||
TEST_ULP (V_NAME_F1 (acosh), 2.50)
|
||||
#else
|
||||
PL_TEST_ULP (V_NAME_F1 (acosh), 2.58)
|
||||
TEST_ULP (V_NAME_F1 (acosh), 2.78)
|
||||
#endif
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
|
||||
TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
|
||||
TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
|
||||
|
|
@ -1,36 +1,35 @@
|
|||
/*
|
||||
* Double-precision vector asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[12];
|
||||
float64x2_t c0, c2, c4, c6, c8, c10;
|
||||
float64x2_t pi_over_2;
|
||||
uint64x2_t abs_mask;
|
||||
double c1, c3, c5, c7, c9, c11;
|
||||
} data = {
|
||||
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
|
||||
on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
|
||||
.poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
|
||||
V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
|
||||
V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
|
||||
V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
|
||||
V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
|
||||
V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
|
||||
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
|
||||
.abs_mask = V2 (0x7fffffffffffffff),
|
||||
.c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4,
|
||||
.c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6,
|
||||
.c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6,
|
||||
.c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7,
|
||||
.c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6,
|
||||
.c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6,
|
||||
.pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
|
||||
};
|
||||
|
||||
#define AllMask v_u64 (0xffffffffffffffff)
|
||||
#define One (0x3ff0000000000000)
|
||||
#define Small (0x3e50000000000000) /* 2^-12. */
|
||||
#define One 0x3ff0000000000000
|
||||
#define Small 0x3e50000000000000 /* 2^-12. */
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
|
|
@ -58,12 +57,11 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
|
|||
asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
|
||||
|
||||
The largest observed error in this region is 2.69 ulps,
|
||||
_ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
|
||||
want 0x1.110d7e85fdd53p-1. */
|
||||
_ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
|
||||
want 0x1.1111dd54ddf99p-1. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
|
@ -76,7 +74,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
|
|||
return special_case (x, x, AllMask);
|
||||
#endif
|
||||
|
||||
uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
|
||||
uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
|
||||
|
||||
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
|
||||
z = x ^ 2 and y = |x| , if |x| < 0.5
|
||||
|
|
@ -89,7 +87,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
|
|||
float64x2_t z4 = vmulq_f64 (z2, z2);
|
||||
float64x2_t z8 = vmulq_f64 (z4, z4);
|
||||
float64x2_t z16 = vmulq_f64 (z8, z8);
|
||||
float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
|
||||
|
||||
/* order-11 estrin. */
|
||||
float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
|
||||
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
|
||||
float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
|
||||
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
|
||||
float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
|
||||
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
|
||||
float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
|
||||
|
||||
float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
|
||||
float64x2_t p = vfmaq_f64 (p07, z16, p811);
|
||||
|
||||
/* Finalize polynomial: z + z * z2 * P(z2). */
|
||||
p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
|
||||
|
|
@ -102,12 +119,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
|
|||
return vbslq_f64 (d->abs_mask, y, x);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, asin, -1.0, 1.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (asin), 2.19)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
|
||||
TEST_SIG (V, D, 1, asin, -1.0, 1.0)
|
||||
TEST_ULP (V_NAME_D1 (asin), 2.20)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
|
||||
TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Single-precision vector asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|||
|
||||
The largest observed error in this region is 2.41 ulps,
|
||||
_ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -93,12 +93,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
|
|||
return vbslq_f32 (v_u32 (AbsMask), y, x);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, asin, -1.0, 1.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (asin), 1.91)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
|
||||
HALF_WIDTH_ALIAS_F1 (asin)
|
||||
|
||||
TEST_SIG (V, F, 1, asin, -1.0, 1.0)
|
||||
TEST_ULP (V_NAME_F1 (asin), 1.91)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
|
||||
242
contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
Normal file
242
contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
/*
|
||||
* Double-precision vector asinh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
#include "v_math.h"
|
||||
|
||||
const static struct data
|
||||
{
|
||||
uint64x2_t huge_bound, abs_mask, off, mask;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
float64x2_t tiny_bound;
|
||||
#endif
|
||||
float64x2_t lc0, lc2;
|
||||
double lc1, lc3, ln2, lc4;
|
||||
|
||||
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
|
||||
double c1, c3, c5, c7, c9, c11, c13, c15;
|
||||
|
||||
} data = {
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
.tiny_bound = V2 (0x1p-26),
|
||||
#endif
|
||||
/* Even terms of polynomial s.t. asinh(x) is approximated by
|
||||
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
|
||||
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
|
||||
|
||||
.c0 = V2 (-0x1.55555555554a7p-3),
|
||||
.c1 = 0x1.3333333326c7p-4,
|
||||
.c2 = V2 (-0x1.6db6db68332e6p-5),
|
||||
.c3 = 0x1.f1c71b26fb40dp-6,
|
||||
.c4 = V2 (-0x1.6e8b8b654a621p-6),
|
||||
.c5 = 0x1.1c4daa9e67871p-6,
|
||||
.c6 = V2 (-0x1.c9871d10885afp-7),
|
||||
.c7 = 0x1.7a16e8d9d2ecfp-7,
|
||||
.c8 = V2 (-0x1.3ddca533e9f54p-7),
|
||||
.c9 = 0x1.0becef748dafcp-7,
|
||||
.c10 = V2 (-0x1.b90c7099dd397p-8),
|
||||
.c11 = 0x1.541f2bb1ffe51p-8,
|
||||
.c12 = V2 (-0x1.d217026a669ecp-9),
|
||||
.c13 = 0x1.0b5c7977aaf7p-9,
|
||||
.c14 = V2 (-0x1.e0f37daef9127p-11),
|
||||
.c15 = 0x1.388b5fe542a6p-12,
|
||||
.c16 = V2 (-0x1.021a48685e287p-14),
|
||||
.c17 = V2 (0x1.93d4ba83d34dap-18),
|
||||
|
||||
.lc0 = V2 (-0x1.ffffffffffff7p-2),
|
||||
.lc1 = 0x1.55555555170d4p-2,
|
||||
.lc2 = V2 (-0x1.0000000399c27p-2),
|
||||
.lc3 = 0x1.999b2e90e94cap-3,
|
||||
.lc4 = -0x1.554e550bd501ep-3,
|
||||
.ln2 = 0x1.62e42fefa39efp-1,
|
||||
|
||||
.off = V2 (0x3fe6900900000000),
|
||||
.huge_bound = V2 (0x5fe0000000000000),
|
||||
.abs_mask = V2 (0x7fffffffffffffff),
|
||||
.mask = V2 (0xfffULL << 52),
|
||||
};
|
||||
|
||||
static float64x2_t NOINLINE VPCS_ATTR
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
|
||||
uint64x2_t special)
|
||||
{
|
||||
/* Copy sign. */
|
||||
y = vbslq_f64 (abs_mask, y, x);
|
||||
return v_call_f64 (asinh, x, y, special);
|
||||
}
|
||||
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
struct entry
|
||||
{
|
||||
float64x2_t invc;
|
||||
float64x2_t logc;
|
||||
};
|
||||
|
||||
static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
/* Since N is a power of 2, n % N = n & (N - 1). */
|
||||
struct entry e;
|
||||
uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
e.logc = vuzp2q_f64 (e0, e1);
|
||||
return e;
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
log_inline (float64x2_t xm, const struct data *d)
|
||||
{
|
||||
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (xm);
|
||||
uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
|
||||
struct entry e = lookup (u_off);
|
||||
|
||||
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
|
||||
/* hi = r + log(c) + k*Ln2. */
|
||||
float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
|
||||
float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
|
||||
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
|
||||
float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
|
||||
y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
|
||||
y = vfmaq_f64 (p, r2, y);
|
||||
return vfmaq_f64 (hi, y, r2);
|
||||
}
|
||||
|
||||
/* Double-precision implementation of vector asinh(x).
|
||||
asinh is very sensitive around 1, so it is impractical to devise a single
|
||||
low-cost algorithm which is sufficiently accurate on a wide range of input.
|
||||
Instead we use two different algorithms:
|
||||
asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
|
||||
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
|
||||
where log(x) is an optimized log approximation, and P(x) is a polynomial
|
||||
shared with the scalar routine. The greatest observed error 2.79 ULP, in
|
||||
|x| >= 1:
|
||||
_ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
|
||||
want 0x1.ffffd003219ddp-1. */
|
||||
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
|
||||
uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint64x2_t iax = vreinterpretq_u64_f64 (ax);
|
||||
uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
|
||||
uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
|
||||
special = vorrq_u64 (special, tiny);
|
||||
#else
|
||||
uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
|
||||
#endif
|
||||
|
||||
/* Option 1: |x| >= 1.
|
||||
Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
|
||||
If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
|
||||
overflow, by setting special lanes to 1. These will be fixed later. */
|
||||
float64x2_t option_1 = v_f64 (0);
|
||||
if (likely (v_any_u64 (gt1)))
|
||||
{
|
||||
#if WANT_SIMD_EXCEPT
|
||||
float64x2_t xm = v_zerofy_f64 (ax, special);
|
||||
#else
|
||||
float64x2_t xm = ax;
|
||||
#endif
|
||||
option_1 = log_inline (
|
||||
vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
|
||||
}
|
||||
|
||||
/* Option 2: |x| < 1.
|
||||
Compute asinh(x) using a polynomial.
|
||||
If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
|
||||
overflow, and tiny lanes, which will underflow, by setting them to 0. They
|
||||
will be fixed later, either by selecting x or falling back to the scalar
|
||||
special-case. The largest observed error in this region is 1.47 ULPs:
|
||||
_ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
|
||||
want 0x1.c1d6bf874019cp-1. */
|
||||
float64x2_t option_2 = v_f64 (0);
|
||||
|
||||
if (likely (v_any_u64 (vceqzq_u64 (gt1))))
|
||||
{
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
|
||||
#endif
|
||||
float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
|
||||
/* Order-17 Pairwise Horner scheme. */
|
||||
float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
|
||||
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
|
||||
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
|
||||
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
|
||||
float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
|
||||
|
||||
float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
|
||||
p = vfmaq_f64 (p1213, z2, p);
|
||||
p = vfmaq_f64 (p1011, z2, p);
|
||||
p = vfmaq_f64 (p89, z2, p);
|
||||
|
||||
p = vfmaq_f64 (p67, z2, p);
|
||||
p = vfmaq_f64 (p45, z2, p);
|
||||
|
||||
p = vfmaq_f64 (p23, z2, p);
|
||||
|
||||
p = vfmaq_f64 (p01, z2, p);
|
||||
option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
|
||||
#if WANT_SIMD_EXCEPT
|
||||
option_2 = vbslq_f64 (tiny, x, option_2);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Choose the right option for each lane. */
|
||||
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
|
||||
if (unlikely (v_any_u64 (special)))
|
||||
{
|
||||
return special_case (x, y, d->abs_mask, special);
|
||||
}
|
||||
/* Copy sign. */
|
||||
return vbslq_f64 (d->abs_mask, y, x);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, asinh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (asinh), 2.29)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000)
|
||||
/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
|
||||
Ensures the per-lane select (vbslq_f64) picks the right option in all cases. */
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5)
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2)
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600)
|
||||
89
contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
Normal file
89
contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
/*
|
||||
* Single-precision vector asinh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_log1pf_inline.h"
|
||||
|
||||
const static struct data
|
||||
{
|
||||
struct v_log1pf_data log1pf_consts;
|
||||
float32x4_t one;
|
||||
uint32x4_t big_bound;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t tiny_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.one = V4 (1),
|
||||
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
|
||||
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
|
||||
#if WANT_SIMD_EXCEPT
|
||||
.tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
|
||||
#endif
|
||||
};
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
|
||||
uint32x4_t special, const struct data *d)
|
||||
{
|
||||
return v_call_f32 (
|
||||
asinhf, x,
|
||||
vreinterpretq_f32_u32 (veorq_u32 (
|
||||
sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
|
||||
special);
|
||||
}
|
||||
|
||||
/* Single-precision implementation of vector asinh(x), using vector log1p.
|
||||
Worst-case error is 2.59 ULP:
|
||||
_ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
|
||||
want 0x1.d449c4p-3. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
|
||||
{
|
||||
const struct data *dat = ptr_barrier (&data);
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
|
||||
uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
|
||||
float32x4_t special_arg = x;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Sidestep tiny and large values to avoid inadvertently triggering
|
||||
under/overflow. */
|
||||
special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
{
|
||||
ax = v_zerofy_f32 (ax, special);
|
||||
x = v_zerofy_f32 (x, special);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* asinh(x) = log(x + sqrt(x * x + 1)).
|
||||
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
|
||||
float32x4_t d
|
||||
= vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
|
||||
float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
|
||||
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return special_case (special_arg, sign, y, special, dat);
|
||||
return vreinterpretq_f32_u32 (veorq_u32 (
|
||||
sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (asinh)
|
||||
|
||||
TEST_SIG (V, F, 1, asinh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (asinh), 2.10)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
|
||||
TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
|
||||
|
|
@ -1,32 +1,32 @@
|
|||
/*
|
||||
* Double-precision vector atan(x) function.
|
||||
*
|
||||
* Copyright (c) 2021-2023, Arm Limited.
|
||||
* Copyright (c) 2021-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
|
||||
float64x2_t pi_over_2;
|
||||
float64x2_t poly[20];
|
||||
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
|
||||
} data = {
|
||||
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
|
||||
[2**-1022, 1.0]. */
|
||||
.poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
|
||||
V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
|
||||
V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
|
||||
V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
|
||||
V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
|
||||
V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
|
||||
V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
|
||||
V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
|
||||
V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
|
||||
V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
|
||||
.c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
|
||||
.c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
|
||||
.c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
|
||||
.c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
|
||||
.c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
|
||||
.c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
|
||||
.c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
|
||||
.c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
|
||||
.c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
|
||||
.c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
|
||||
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
|
||||
};
|
||||
|
||||
|
|
@ -42,6 +42,11 @@ static const struct data
|
|||
float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
float64x2_t c1719 = vld1q_f64 (&d->c17);
|
||||
|
||||
/* Small cases, infs and nans are supported by our approximation technique,
|
||||
but do not set fenv flags correctly. Only trigger special case if we need
|
||||
|
|
@ -80,9 +85,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
|
|||
float64x2_t x2 = vmulq_f64 (z2, z2);
|
||||
float64x2_t x4 = vmulq_f64 (x2, x2);
|
||||
float64x2_t x8 = vmulq_f64 (x4, x4);
|
||||
float64x2_t y
|
||||
= vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
|
||||
v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
|
||||
|
||||
/* estrin_7. */
|
||||
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
|
||||
float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
|
||||
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
|
||||
float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
|
||||
|
||||
float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
|
||||
|
||||
/* estrin_11. */
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
|
||||
float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
|
||||
|
||||
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
|
||||
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
|
||||
float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
|
||||
|
||||
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
|
||||
float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
|
||||
float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
|
||||
|
||||
float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
|
||||
float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
|
||||
|
||||
float64x2_t y = vfmaq_f64 (p07, p819, x8);
|
||||
|
||||
/* Finalize. y = shift + z + z^3 * P(z^2). */
|
||||
y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
|
||||
|
|
@ -93,12 +124,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
|
|||
return y;
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, atan, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (atan), 1.78)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
|
||||
TEST_SIG (V, D, 1, atan, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (atan), 1.78)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
|
||||
TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
|
||||
TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
|
||||
TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
|
||||
TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
|
||||
TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
|
||||
171
contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
Normal file
171
contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
Normal file
|
|
@ -0,0 +1,171 @@
|
|||
/*
|
||||
* Double-precision vector atan2(y, x) function.
|
||||
*
|
||||
* Copyright (c) 2021-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Constants for the double-precision vector atan2 in this file.
   Even-indexed polynomial coefficients are stored as full vectors, while
   odd-indexed ones are stored as consecutive scalar doubles: the routine
   loads them two at a time with vld1q_f64 (&d->c1), (&d->c5), ... and
   selects one by lane in vfmaq_laneq_f64.  */
static const struct data
|
||||
{
|
||||
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
|
||||
float64x2_t pi_over_2;
|
||||
/* Declaration order matters here: pairs (c1,c3), (c5,c7), (c9,c11),
   (c13,c15) and (c17,c19) are each read with one vector load.  */
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
|
||||
/* zeroinfnan: threshold used by zeroinfnan () to flag special inputs.
   minustwo: bit pattern of -2.0, ANDed with the x < 0 mask to build the
   shift.  */
uint64x2_t zeroinfnan, minustwo;
|
||||
} data = {
|
||||
/* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
|
||||
[2**-1022, 1.0]. */
|
||||
.c0 = V2 (-0x1.5555555555555p-2),
|
||||
.c1 = 0x1.99999999996c1p-3,
|
||||
.c2 = V2 (-0x1.2492492478f88p-3),
|
||||
.c3 = 0x1.c71c71bc3951cp-4,
|
||||
.c4 = V2 (-0x1.745d160a7e368p-4),
|
||||
.c5 = 0x1.3b139b6a88ba1p-4,
|
||||
.c6 = V2 (-0x1.11100ee084227p-4),
|
||||
.c7 = 0x1.e1d0f9696f63bp-5,
|
||||
.c8 = V2 (-0x1.aebfe7b418581p-5),
|
||||
.c9 = 0x1.842dbe9b0d916p-5,
|
||||
.c10 = V2 (-0x1.5d30140ae5e99p-5),
|
||||
.c11 = 0x1.338e31eb2fbbcp-5,
|
||||
.c12 = V2 (-0x1.00e6eece7de8p-5),
|
||||
.c13 = 0x1.860897b29e5efp-6,
|
||||
.c14 = V2 (-0x1.0051381722a59p-6),
|
||||
.c15 = 0x1.14e9dc19a4a4ep-7,
|
||||
.c16 = V2 (-0x1.d0062b42fe3bfp-9),
|
||||
.c17 = 0x1.17739e210171ap-10,
|
||||
.c18 = V2 (-0x1.ab24da7be7402p-13),
|
||||
.c19 = 0x1.358851160a528p-16,
|
||||
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
|
||||
/* 2 * asuint64 (INFINITY) - 1, the right-hand side of the comparison in
   zeroinfnan ().  */
.zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
|
||||
.minustwo = V2 (0xc000000000000000),
|
||||
};
|
||||
|
||||
/* Only bit 63 set, i.e. the IEEE-754 double sign bit (presumably broadcast
   to every lane by v_u64 - confirm in v_math.h).  */
#define SignMask v_u64 (0x8000000000000000)
|
||||
|
||||
/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls).
   y, x    - the original inputs.
   ret     - the vector-path result before the sign of the inputs is applied.
   sign_xy - sign bits to XOR into ret.
   cmp     - mask of lanes flagged as special.
   NOTE(review): v_call2_f64 presumably calls scalar atan2 only on the lanes
   selected by cmp and keeps ret elsewhere - confirm in v_math.h.  */
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
|
||||
uint64x2_t sign_xy, uint64x2_t cmp)
|
||||
{
|
||||
/* Account for the sign of x and y. */
|
||||
ret = vreinterpretq_f64_u64 (
|
||||
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
|
||||
return v_call2_f64 (atan2, y, x, ret, cmp);
|
||||
}
|
||||
|
||||
/* Flags each lane of i that is the bit representation of 0, infinity or nan,
   of either sign.  The result is the lane mask produced by vcgeq_u64, not the
   integer 1.
   Adding i to itself shifts the sign bit out; subtracting 1 then wraps a zero
   around to the largest unsigned value, so zeros, infinities and NaNs all
   compare >= d->zeroinfnan while every other finite value stays below it.  */
|
||||
static inline uint64x2_t
|
||||
zeroinfnan (uint64x2_t i, const struct data *d)
|
||||
{
|
||||
/* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
|
||||
return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
|
||||
}
|
||||
|
||||
/* Fast implementation of vector atan2.
|
||||
Maximum observed error is 2.8 ulps:
|
||||
_ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
|
||||
got 0x1.92d628ab678ccp-1
|
||||
want 0x1.92d628ab678cfp-1. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
|
||||
{
|
||||
/* Lane-wise atan2 (y, x): form a ratio z = n / q with |z| <= 1, evaluate
   z + z^3 * P(z^2), add a multiple of pi/2 chosen from the signs and
   magnitudes of the inputs, then apply the combined sign of x and y.  If any
   lane holds a zero, infinity or NaN the result goes through special_case.  */
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t iy = vreinterpretq_u64_f64 (y);
|
||||
|
||||
/* Lanes where either input is 0, infinity or NaN. */
uint64x2_t special_cases
|
||||
= vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
|
||||
|
||||
/* sign_xy is the XOR of the two sign bits; it is applied to the result at
   the very end (or inside special_case).  */
uint64x2_t sign_x = vandq_u64 (ix, SignMask);
|
||||
uint64x2_t sign_y = vandq_u64 (iy, SignMask);
|
||||
uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
|
||||
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
float64x2_t ay = vabsq_f64 (y);
|
||||
|
||||
/* pred_xlt0: x < 0.  pred_aygtax: |y| > |x| (absolute compare).  */
uint64x2_t pred_xlt0 = vcltzq_f64 (x);
|
||||
uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
|
||||
|
||||
/* Set up z for call to atan. */
|
||||
/* z = -|x| / |y| when |y| > |x|, otherwise |y| / |x|, so the larger
   magnitude is always the divisor.  */
float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
|
||||
float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
|
||||
float64x2_t z = vdivq_f64 (n, q);
|
||||
|
||||
/* Work out the correct shift. */
|
||||
/* In units of pi/2: start from -2 if x < 0 else 0, add 1 when |y| > |x|,
   then scale by pi/2.  */
float64x2_t shift
|
||||
= vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
|
||||
shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
|
||||
shift = vmulq_f64 (shift, d->pi_over_2);
|
||||
|
||||
/* Calculate the polynomial approximation.
|
||||
Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
|
||||
full scheme to avoid underflow in x^16.
|
||||
The order 19 polynomial P approximates
|
||||
(atan(sqrt(x))-sqrt(x))/x^(3/2). */
|
||||
/* The xN names count powers of z2, not of z: x2 = z^4, x4 = z^8 and
   x8 = z^16.  */
float64x2_t z2 = vmulq_f64 (z, z);
|
||||
float64x2_t x2 = vmulq_f64 (z2, z2);
|
||||
float64x2_t x4 = vmulq_f64 (x2, x2);
|
||||
float64x2_t x8 = vmulq_f64 (x4, x4);
|
||||
|
||||
/* Each load fetches two consecutive odd-indexed scalar coefficients from
   struct data; the lane index below picks one of them.  */
float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
float64x2_t c1719 = vld1q_f64 (&d->c17);
|
||||
|
||||
/* estrin_7. */
|
||||
/* pAB holds the partial sum of the terms with coefficients cA..cB.  */
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
|
||||
float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
|
||||
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
|
||||
float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
|
||||
|
||||
float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
|
||||
|
||||
/* estrin_11. */
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
|
||||
float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
|
||||
|
||||
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
|
||||
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
|
||||
float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
|
||||
|
||||
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
|
||||
float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
|
||||
float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
|
||||
|
||||
float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
|
||||
float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
|
||||
|
||||
/* Join the two halves: P(z^2) = p07 + x8 * p819.  */
float64x2_t ret = vfmaq_f64 (p07, p819, x8);
|
||||
|
||||
/* Finalize. y = shift + z + z^3 * P(z^2). */
|
||||
ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
|
||||
ret = vaddq_f64 (ret, shift);
|
||||
|
||||
/* ret has not had sign_xy applied yet; special_case applies it itself.  */
if (unlikely (v_any_u64 (special_cases)))
|
||||
return special_case (y, x, ret, sign_xy, special_cases);
|
||||
|
||||
/* Account for the sign of x and y. */
|
||||
ret = vreinterpretq_f64_u64 (
|
||||
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
|
||||
TEST_SIG (V, D, 2, atan2)
|
||||
// TODO tighten this once __v_atan2 is fixed
|
||||
TEST_ULP (V_NAME_D2 (atan2), 2.9)
|
||||
TEST_DISABLE_FENV (V_NAME_D2 (atan2))
|
||||
TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
|
||||
|
|
@ -1,59 +1,64 @@
|
|||
/*
|
||||
* Single-precision vector atan2(y, x) function.
|
||||
*
|
||||
* Copyright (c) 2021-2023, Arm Limited.
|
||||
* Copyright (c) 2021-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[8];
|
||||
float32x4_t pi_over_2;
|
||||
float32x4_t c0, pi_over_2, c4, c6, c2;
|
||||
float c1, c3, c5, c7;
|
||||
uint32x4_t comp_const;
|
||||
} data = {
|
||||
/* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
|
||||
[2**-128, 1.0].
|
||||
Generated using fpminimax between FLT_MIN and 1. */
|
||||
.poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
|
||||
V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
|
||||
V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
|
||||
.pi_over_2 = V4 (0x1.921fb6p+0f),
|
||||
.c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
|
||||
.c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
|
||||
.c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
|
||||
.c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
|
||||
.pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
|
||||
};
|
||||
|
||||
#define SignMask v_u32 (0x80000000)
|
||||
|
||||
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
|
||||
special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
|
||||
uint32x4_t sign_xy, uint32x4_t cmp)
|
||||
{
|
||||
/* Account for the sign of x and y. */
|
||||
ret = vreinterpretq_f32_u32 (
|
||||
veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
|
||||
return v_call2_f32 (atan2f, y, x, ret, cmp);
|
||||
}
|
||||
|
||||
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
|
||||
static inline uint32x4_t
|
||||
zeroinfnan (uint32x4_t i)
|
||||
zeroinfnan (uint32x4_t i, const struct data *d)
|
||||
{
|
||||
/* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
|
||||
return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
|
||||
v_u32 (2 * 0x7f800000lu - 1));
|
||||
return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
|
||||
}
|
||||
|
||||
/* Fast implementation of vector atan2f. Maximum observed error is
|
||||
2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
|
||||
_ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
|
||||
want 0x1.967f00p-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
||||
{
|
||||
const struct data *data_ptr = ptr_barrier (&data);
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
uint32x4_t iy = vreinterpretq_u32_f32 (y);
|
||||
|
||||
uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
|
||||
uint32x4_t special_cases
|
||||
= vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
|
||||
|
||||
uint32x4_t sign_x = vandq_u32 (ix, SignMask);
|
||||
uint32x4_t sign_y = vandq_u32 (iy, SignMask);
|
||||
|
|
@ -67,14 +72,14 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
|||
|
||||
/* Set up z for call to atanf. */
|
||||
float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
|
||||
float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
|
||||
float32x4_t z = vdivq_f32 (n, d);
|
||||
float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
|
||||
float32x4_t z = vdivq_f32 (n, q);
|
||||
|
||||
/* Work out the correct shift. */
|
||||
float32x4_t shift = vreinterpretq_f32_u32 (
|
||||
vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
|
||||
shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
|
||||
shift = vmulq_f32 (shift, data_ptr->pi_over_2);
|
||||
shift = vmulq_f32 (shift, d->pi_over_2);
|
||||
|
||||
/* Calculate the polynomial approximation.
|
||||
Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
|
||||
|
|
@ -86,30 +91,37 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
|||
float32x4_t z2 = vmulq_f32 (z, z);
|
||||
float32x4_t z4 = vmulq_f32 (z2, z2);
|
||||
|
||||
float32x4_t ret = vfmaq_f32 (
|
||||
v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
|
||||
vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
|
||||
float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
|
||||
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
|
||||
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
|
||||
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
|
||||
float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
|
||||
float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
|
||||
|
||||
float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
|
||||
|
||||
/* y = shift + z + z^3 * P(z^2). */
|
||||
ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
|
||||
|
||||
/* Account for the sign of y. */
|
||||
ret = vreinterpretq_f32_u32 (
|
||||
veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
|
||||
|
||||
if (unlikely (v_any_u32 (special_cases)))
|
||||
{
|
||||
return special_case (y, x, ret, special_cases);
|
||||
return special_case (y, x, ret, sign_xy, special_cases);
|
||||
}
|
||||
|
||||
return ret;
|
||||
/* Account for the sign of x and y. */
|
||||
return vreinterpretq_f32_u32 (
|
||||
veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F2 (atan2)
|
||||
|
||||
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
|
||||
PL_SIG (V, F, 2, atan2)
|
||||
PL_TEST_ULP (V_NAME_F2 (atan2), 2.46)
|
||||
PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
|
||||
PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
|
||||
TEST_SIG (V, F, 2, atan2)
|
||||
TEST_DISABLE_FENV (V_NAME_F2 (atan2))
|
||||
TEST_ULP (V_NAME_F2 (atan2), 2.46)
|
||||
TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Single-precision vector atan(x) function.
|
||||
*
|
||||
* Copyright (c) 2021-2023, Arm Limited.
|
||||
* Copyright (c) 2021-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_poly_f32.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -43,7 +43,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|||
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
|
||||
using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
|
||||
_ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -98,10 +98,12 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
|
|||
return y;
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, atan, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (atan), 2.5)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
|
||||
HALF_WIDTH_ALIAS_F1 (atan)
|
||||
|
||||
TEST_SIG (V, F, 1, atan, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (atan), 2.5)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Double-precision vector atanh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define WANT_V_LOG1P_K0_SHORTCUT 0
|
||||
#include "v_log1p_inline.h"
|
||||
|
|
@ -15,15 +15,19 @@
|
|||
const static struct data
|
||||
{
|
||||
struct v_log1p_data log1p_consts;
|
||||
uint64x2_t one, half;
|
||||
uint64x2_t one;
|
||||
uint64x2_t sign_mask;
|
||||
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
|
||||
.one = V2 (0x3ff0000000000000),
|
||||
.half = V2 (0x3fe0000000000000) };
|
||||
.sign_mask = V2 (0x8000000000000000) };
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
|
||||
special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
|
||||
uint64x2_t special, const struct data *d)
|
||||
{
|
||||
return v_call_f64 (atanh, x, y, special);
|
||||
y = log1p_inline (y, &d->log1p_consts);
|
||||
return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
|
||||
vmulq_f64 (halfsign, y), special);
|
||||
}
|
||||
|
||||
/* Approximation for vector double-precision atanh(x) using modified log1p.
|
||||
|
|
@ -35,11 +39,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
|
|||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
|
||||
uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
|
||||
uint64x2_t special = vcgeq_u64 (ia, d->one);
|
||||
float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
ax = v_zerofy_f64 (ax, special);
|
||||
|
|
@ -47,20 +50,26 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
|
|||
|
||||
float64x2_t y;
|
||||
y = vaddq_f64 (ax, ax);
|
||||
y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
|
||||
y = log1p_inline (y, &d->log1p_consts);
|
||||
y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
|
||||
|
||||
if (unlikely (v_any_u64 (special)))
|
||||
return special_case (x, vmulq_f64 (y, halfsign), special);
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return special_case (x, halfsign, y, special, d);
|
||||
#else
|
||||
return special_case (ax, halfsign, y, special, d);
|
||||
#endif
|
||||
|
||||
y = log1p_inline (y, &d->log1p_consts);
|
||||
return vmulq_f64 (y, halfsign);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, atanh, -1.0, 1.0)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_ULP (V_NAME_D1 (atanh), 3.32)
|
||||
TEST_SIG (V, D, 1, atanh, -1.0, 1.0)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
|
||||
TEST_ULP (V_NAME_D1 (atanh), 3.32)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100)
|
||||
/* atanh is asymptotic at 1, which is the default control value - have to set
|
||||
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
|
||||
control lane is irrelevant if fp exceptions are disabled). */
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0)
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0)
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Single-precision vector atanh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_log1pf_inline.h"
|
||||
|
||||
const static struct data
|
||||
|
|
@ -30,16 +30,18 @@ const static struct data
|
|||
#define Half v_u32 (0x3f000000)
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
|
||||
uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (atanhf, x, y, special);
|
||||
return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
|
||||
vmulq_f32 (halfsign, y), special);
|
||||
}
|
||||
|
||||
/* Approximation for vector single-precision atanh(x) using modified log1p.
|
||||
The maximum error is 3.08 ULP:
|
||||
__v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
|
||||
want 0x1.ffcb82p-5. */
|
||||
VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
|
||||
The maximum error is 2.93 ULP:
|
||||
_ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
|
||||
want 0x1.f4dcf8p-5. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -58,20 +60,31 @@ VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
|
|||
uint32x4_t special = vcgeq_u32 (iax, d->one);
|
||||
#endif
|
||||
|
||||
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
|
||||
y = log1pf_inline (y, d->log1pf_consts);
|
||||
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
|
||||
vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
|
||||
y = log1pf_inline (y, &d->log1pf_consts);
|
||||
|
||||
/* If exceptions not required, pass ax to special-case for shorter dependency
|
||||
chain. If exceptions are required ax will have been zerofied, so have to
|
||||
pass x. */
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return special_case (x, vmulq_f32 (halfsign, y), special);
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return special_case (x, halfsign, y, special);
|
||||
#else
|
||||
return special_case (ax, halfsign, y, special);
|
||||
#endif
|
||||
return vmulq_f32 (halfsign, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, atanh, -1.0, 1.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (atanh), 2.59)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
|
||||
HALF_WIDTH_ALIAS_F1 (atanh)
|
||||
|
||||
TEST_SIG (V, F, 1, atanh, -1.0, 1.0)
|
||||
TEST_ULP (V_NAME_F1 (atanh), 2.44)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000)
|
||||
/* atanh is asymptotic at 1, which is the default control value - have to set
|
||||
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
|
||||
control lane is irrelevant if fp exceptions are disabled). */
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0)
|
||||
TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0)
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Double-precision vector cbrt(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_poly_f64.h"
|
||||
|
||||
const static struct data
|
||||
{
|
||||
|
|
@ -40,13 +40,20 @@ special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
|
|||
return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
|
||||
}
|
||||
|
||||
/* Approximation for double-precision vector cbrt(x), using low-order polynomial
|
||||
and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
|
||||
/* Approximation for double-precision vector cbrt(x), using low-order
|
||||
polynomial and two Newton iterations.
|
||||
|
||||
The vector version of frexp does not handle subnormals
|
||||
correctly. As a result these need to be handled by the scalar
|
||||
fallback, where accuracy may be worse than that of the vector code
|
||||
path.
|
||||
|
||||
Greatest observed error in the normal range is 1.79 ULP. Errors repeat
|
||||
according to the exponent, for instance an error observed for double value
|
||||
m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
|
||||
integer.
|
||||
__v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
|
||||
want 0x1.965fe72821e99p+0. */
|
||||
_ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
|
||||
want 0x1.965fe72821e99p+0. */
|
||||
VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
|
@ -64,8 +71,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
|
|||
uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
|
||||
int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
|
||||
|
||||
/* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
|
||||
Newton iterations. */
|
||||
/* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
|
||||
for Newton iterations. */
|
||||
float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
|
||||
float64x2_t one_third = d->one_third;
|
||||
/* Two iterations of Newton's method for iteratively approximating cbrt. */
|
||||
|
|
@ -84,8 +91,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
|
|||
|
||||
Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
|
||||
|
||||
Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
|
||||
an integer in [-2, 2], and can be looked up in the table T. Hence the
|
||||
Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
|
||||
is an integer in [-2, 2], and can be looked up in the table T. Hence the
|
||||
result is assembled as:
|
||||
|
||||
cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
|
||||
|
|
@ -110,7 +117,11 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
|
|||
return vbslq_f64 (d->abs_mask, y, x);
|
||||
}
|
||||
|
||||
PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30)
|
||||
PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
|
||||
PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt))
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
|
||||
/* Worst-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which
|
||||
has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
|
||||
in the vector path is 1.79 ULP.
|
||||
[1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
|
||||
Functions in Single, Double, Double Extended, and Quadruple Precision. */
|
||||
TEST_ULP (V_NAME_D1 (cbrt), 3.17)
|
||||
TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Single-precision vector cbrt(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_poly_f32.h"
|
||||
|
||||
const static struct data
|
||||
{
|
||||
|
|
@ -49,7 +49,7 @@ shifted_lookup (const float *table, int32x4_t i)
|
|||
0x1.85a2aa and the exponent is a multiple of 3, for example:
|
||||
_ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
|
||||
want 0x1.267932p+1. */
|
||||
VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
|
||||
|
|
@ -110,7 +110,8 @@ VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
|
|||
return vbslq_f32 (SignMask, x, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15)
|
||||
PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt))
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
|
||||
HALF_WIDTH_ALIAS_F1 (cbrt)
|
||||
|
||||
TEST_SIG (V, F, 1, cbrt, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (cbrt), 1.15)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Double-precision vector sincos function - return-by-value interface.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_sincos_common.h"
|
||||
#include "v_math.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static float64x2x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
|
||||
|
|
@ -34,11 +34,13 @@ _ZGVnN2v_cexpi (float64x2_t x)
|
|||
return sc;
|
||||
}
|
||||
|
||||
PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
|
||||
PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
|
||||
TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos)
|
||||
TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin)
|
||||
TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
|
||||
TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
|
||||
#define V_CEXPI_INTERVAL(lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
|
||||
TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
|
||||
TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
|
||||
V_CEXPI_INTERVAL (0, 0x1p23, 500000)
|
||||
V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
|
||||
V_CEXPI_INTERVAL (0x1p23, inf, 10000)
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Single-precision vector cexpi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_sincosf_common.h"
|
||||
#include "v_math.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static float32x4x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
|
||||
|
|
@ -36,11 +36,13 @@ _ZGVnN4v_cexpif (float32x4_t x)
|
|||
return sc;
|
||||
}
|
||||
|
||||
PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
|
||||
PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos)
|
||||
TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
|
||||
TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
|
||||
#define V_CEXPIF_INTERVAL(lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
|
||||
TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
|
||||
TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
|
||||
V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
|
||||
V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
|
||||
V_CEXPIF_INTERVAL (0x1p20, inf, 10000)
|
||||
|
|
@ -1,17 +1,19 @@
|
|||
/*
|
||||
* Double-precision vector cos function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[7];
|
||||
float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
|
||||
float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
|
||||
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
|
||||
|
|
@ -19,11 +21,9 @@ static const struct data
|
|||
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
|
||||
V2 (-0x1.9e9540300a1p-41) },
|
||||
.inv_pi = V2 (0x1.45f306dc9c883p-2),
|
||||
.half_pi = V2 (0x1.921fb54442d18p+0),
|
||||
.pi_1 = V2 (0x1.921fb54442d18p+1),
|
||||
.pi_2 = V2 (0x1.1a62633145c06p-53),
|
||||
.pi_3 = V2 (0x1.c1cd129024e09p-106),
|
||||
.shift = V2 (0x1.8p52),
|
||||
.range_val = V2 (0x1p23)
|
||||
};
|
||||
|
||||
|
|
@ -57,10 +57,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
|
|||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
|
||||
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
|
||||
n = vsubq_f64 (n, d->shift);
|
||||
n = vsubq_f64 (n, v_f64 (0.5));
|
||||
n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
|
||||
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
|
||||
n = vsubq_f64 (n, v_f64 (0.5f));
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f64 (r, d->pi_1, n);
|
||||
|
|
@ -85,3 +84,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
|
|||
return special_case (x, y, odd, cmp);
|
||||
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, cos, -3.1, 3.1)
|
||||
TEST_ULP (V_NAME_D1 (cos), 3.0)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000)
|
||||
|
|
@ -1,17 +1,19 @@
|
|||
/*
|
||||
* Single-precision vector cos function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[4];
|
||||
float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
|
||||
float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* 1.886 ulp error. */
|
||||
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
|
||||
|
|
@ -22,8 +24,6 @@ static const struct data
|
|||
.pi_3 = V4 (-0x1.ee59dap-49f),
|
||||
|
||||
.inv_pi = V4 (0x1.45f306p-2f),
|
||||
.shift = V4 (0x1.8p+23f),
|
||||
.half_pi = V4 (0x1.921fb6p0f),
|
||||
.range_val = V4 (0x1p20f)
|
||||
};
|
||||
|
||||
|
|
@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
|||
return v_call_f32 (cosf, x, y, cmp);
|
||||
}
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, r, r2, r3, y;
|
||||
|
|
@ -58,9 +58,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
|
|||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
|
||||
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
|
||||
n = vsubq_f32 (n, d->shift);
|
||||
n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
|
||||
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
|
||||
n = vsubq_f32 (n, v_f32 (0.5f));
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
|
|
@ -80,3 +79,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
|
|||
return special_case (x, y, odd, cmp);
|
||||
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (cos)
|
||||
|
||||
TEST_SIG (V, F, 1, cos, -3.1, 3.1)
|
||||
TEST_ULP (V_NAME_F1 (cos), 1.4)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000)
|
||||
|
|
@ -1,18 +1,20 @@
|
|||
/*
|
||||
* Double-precision vector cosh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[3];
|
||||
float64x2_t inv_ln2, ln2, shift, thres;
|
||||
float64x2_t inv_ln2;
|
||||
double ln2[2];
|
||||
float64x2_t shift, thres;
|
||||
uint64x2_t index_mask, special_bound;
|
||||
} data = {
|
||||
.poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
|
||||
|
|
@ -48,8 +50,9 @@ exp_inline (float64x2_t x)
|
|||
float64x2_t n = vsubq_f64 (z, d->shift);
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
|
||||
r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
|
||||
float64x2_t ln2 = vld1q_f64 (d->ln2);
|
||||
float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
|
||||
r = vfmaq_laneq_f64 (r, n, ln2, 1);
|
||||
|
||||
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
|
||||
uint64x2_t i = vandq_u64 (u, d->index_mask);
|
||||
|
|
@ -97,8 +100,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
|
|||
return vaddq_f64 (half_t, half_over_t);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, cosh, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (cosh), 1.43)
|
||||
PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh))
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
|
||||
TEST_SIG (V, D, 1, cosh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (cosh), 1.43)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
|
||||
|
|
@ -1,32 +1,39 @@
|
|||
/*
|
||||
* Single-precision vector cosh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_expf_inline.h"
|
||||
#include "v_math.h"
|
||||
#include "mathlib.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
struct v_expf_data expf_consts;
|
||||
uint32x4_t tiny_bound, special_bound;
|
||||
uint32x4_t tiny_bound;
|
||||
float32x4_t bound;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t special_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.expf_consts = V_EXPF_DATA,
|
||||
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
|
||||
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
|
||||
.bound = V4 (0x1.5a92d8p+6),
|
||||
#if WANT_SIMD_EXCEPT
|
||||
.special_bound = V4 (0x42ad496c),
|
||||
#endif
|
||||
};
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
|
||||
uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (coshf, x, y, special);
|
||||
return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
@ -34,18 +41,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|||
Maximum error is 2.38 ULP:
|
||||
_ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
|
||||
want 0x1.6a4922p+4. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If fp exceptions are to be triggered correctly, fall back to the scalar
|
||||
variant for all inputs if any input is a special value or above the bound
|
||||
at which expf overflows. */
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return v_call_f32 (coshf, x, x, v_u32 (-1));
|
||||
|
||||
|
|
@ -54,10 +60,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
|
|||
input to 0, which will generate no exceptions. */
|
||||
if (unlikely (v_any_u32 (tiny)))
|
||||
ax = v_zerofy_f32 (ax, tiny);
|
||||
float32x4_t t = v_expf_inline (ax, &d->expf_consts);
|
||||
#else
|
||||
uint32x4_t special = vcageq_f32 (x, d->bound);
|
||||
float32x4_t t = v_expf_inline (x, &d->expf_consts);
|
||||
#endif
|
||||
|
||||
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
|
||||
float32x4_t t = v_expf_inline (ax, &d->expf_consts);
|
||||
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
|
||||
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
|
||||
|
||||
|
|
@ -66,15 +75,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
|
|||
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
|
||||
#else
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return special_case (x, vaddq_f32 (half_t, half_over_t), special);
|
||||
return special_case (x, half_t, half_over_t, special);
|
||||
#endif
|
||||
|
||||
return vaddq_f32 (half_t, half_over_t);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, cosh, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (cosh), 1.89)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
|
||||
HALF_WIDTH_ALIAS_F1 (cosh)
|
||||
|
||||
TEST_SIG (V, F, 1, cosh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (cosh), 1.89)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
|
||||
|
|
@ -1,15 +1,15 @@
|
|||
/*
|
||||
* Double-precision vector cospi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -31,7 +31,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
|
|||
{
|
||||
/* Fall back to scalar code. */
|
||||
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
return v_call_f64 (cospi, x, y, cmp);
|
||||
return v_call_f64 (arm_math_cospi, x, y, cmp);
|
||||
}
|
||||
|
||||
/* Approximation for vector double-precision cospi(x).
|
||||
|
|
@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
|
|||
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, cospi, -0.9, 0.9)
|
||||
PL_TEST_ULP (V_NAME_D1 (cospi), 2.56)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_ULP (V_NAME_D1 (cospi), 2.56)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,15 +1,15 @@
|
|||
/*
|
||||
* Single-precision vector cospi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -26,14 +26,14 @@ static float32x4_t VPCS_ATTR NOINLINE
|
|||
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
||||
{
|
||||
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
return v_call_f32 (cospif, x, y, cmp);
|
||||
return v_call_f32 (arm_math_cospif, x, y, cmp);
|
||||
}
|
||||
|
||||
/* Approximation for vector single-precision cospi(x)
|
||||
Maximum Error: 3.17 ULP:
|
||||
_ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
|
||||
want 0x1.f7cd5p-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -74,10 +74,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
|
|||
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, cospi, -0.9, 0.9)
|
||||
PL_TEST_ULP (V_NAME_F1 (cospi), 2.67)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
|
||||
HALF_WIDTH_ALIAS_F1 (cospi)
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_ULP (V_NAME_F1 (cospi), 2.67)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,30 +1,32 @@
|
|||
/*
|
||||
* Double-precision vector erf(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t third;
|
||||
float64x2_t tenth, two_over_five, two_over_fifteen;
|
||||
float64x2_t two_over_nine, two_over_fortyfive;
|
||||
float64x2_t tenth, two_over_five, two_over_nine;
|
||||
double two_over_fifteen, two_over_fortyfive;
|
||||
float64x2_t max, shift;
|
||||
uint64x2_t max_idx;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
float64x2_t tiny_bound, huge_bound, scale_minus_one;
|
||||
#endif
|
||||
} data = {
|
||||
.max_idx = V2 (768),
|
||||
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
|
||||
.two_over_fifteen = V2 (0x1.1111111111111p-3),
|
||||
.two_over_fifteen = 0x1.1111111111111p-3,
|
||||
.tenth = V2 (-0x1.999999999999ap-4),
|
||||
.two_over_five = V2 (-0x1.999999999999ap-2),
|
||||
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
|
||||
.two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
|
||||
.two_over_fortyfive = 0x1.6c16c16c16c17p-5,
|
||||
.max = V2 (5.9921875), /* 6 - 1/128. */
|
||||
.shift = V2 (0x1p45),
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
|
@ -46,8 +48,8 @@ static inline struct entry
|
|||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
|
||||
e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
|
||||
float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
|
||||
e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
|
||||
e.erf = vuzp1q_f64 (e1, e2);
|
||||
e.scale = vuzp2q_f64 (e1, e2);
|
||||
return e;
|
||||
|
|
@ -77,8 +79,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
|
|||
float64x2_t a = vabsq_f64 (x);
|
||||
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
|
||||
to return expected results. */
|
||||
uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
|
||||
uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
|
||||
uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
|
||||
uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* |x| huge or tiny. */
|
||||
|
|
@ -105,7 +107,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
|
|||
segfault. */
|
||||
uint64x2_t i
|
||||
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
|
||||
i = vbslq_u64 (a_le_max, i, v_u64 (768));
|
||||
i = vbslq_u64 (a_le_max, i, dat->max_idx);
|
||||
struct entry e = lookup (i);
|
||||
|
||||
float64x2_t r = vsubq_f64 (z, shift);
|
||||
|
|
@ -115,14 +117,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
|
|||
float64x2_t d2 = vmulq_f64 (d, d);
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
|
||||
float64x2_t two_over_fifteen_and_fortyfive
|
||||
= vld1q_f64 (&dat->two_over_fifteen);
|
||||
|
||||
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
|
||||
float64x2_t p1 = r;
|
||||
float64x2_t p2
|
||||
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
|
||||
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
|
||||
float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
|
||||
float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
|
||||
two_over_fifteen_and_fortyfive, 0);
|
||||
p4 = vfmsq_f64 (dat->tenth, r2, p4);
|
||||
float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
|
||||
float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
|
||||
two_over_fifteen_and_fortyfive, 1);
|
||||
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
|
||||
|
||||
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
|
||||
|
|
@ -150,9 +157,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
|
|||
return y;
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, erf, -6.0, 6.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (erf), 1.79)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
|
||||
TEST_SIG (V, D, 1, erf, -6.0, 6.0)
|
||||
TEST_ULP (V_NAME_D1 (erf), 1.79)
|
||||
/* WANT_SIMD_EXCEPT blocks miss some cases. */
|
||||
TEST_DISABLE_FENV (V_NAME_D1 (erf))
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
|
||||
|
|
@ -1,21 +1,21 @@
|
|||
/*
|
||||
* Double-precision vector erfc(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint64x2_t offset, table_scale;
|
||||
float64x2_t max, shift;
|
||||
float64x2_t p20, p40, p41, p42;
|
||||
float64x2_t p51, p52;
|
||||
float64x2_t qr5, qr6, qr7, qr8, qr9;
|
||||
float64x2_t p20, p40, p41, p51;
|
||||
double p42, p52;
|
||||
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
|
||||
#if WANT_SIMD_EXCEPT
|
||||
float64x2_t uflow_bound;
|
||||
#endif
|
||||
|
|
@ -30,9 +30,9 @@ static const struct data
|
|||
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
|
||||
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
|
||||
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
|
||||
.p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
|
||||
.p42 = 0x1.1111111111111p-3, /* 2/15. */
|
||||
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
|
||||
.p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
|
||||
.p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
|
||||
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
|
||||
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
|
||||
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
|
||||
|
|
@ -57,8 +57,10 @@ static inline struct entry
|
|||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
|
||||
e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
|
||||
float64x2_t e1
|
||||
= vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
|
||||
float64x2_t e2
|
||||
= vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
|
||||
e.erfc = vuzp1q_f64 (e1, e2);
|
||||
e.scale = vuzp2q_f64 (e1, e2);
|
||||
return e;
|
||||
|
|
@ -144,22 +146,26 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
|
|||
float64x2_t p1 = r;
|
||||
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
|
||||
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
|
||||
float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
|
||||
float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
|
||||
float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
|
||||
p4 = vfmsq_f64 (dat->p40, r2, p4);
|
||||
float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
|
||||
float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
|
||||
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
|
||||
/* Compute p_i using recurrence relation:
|
||||
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
|
||||
float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
|
||||
p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
|
||||
float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
|
||||
p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
|
||||
float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
|
||||
p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
|
||||
float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
|
||||
p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
|
||||
float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
|
||||
p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
|
||||
float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
|
||||
qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
|
||||
qr9 = vld1q_f64 (dat->qr9);
|
||||
float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
|
||||
p6 = vmulq_laneq_f64 (p6, qr5, 1);
|
||||
float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
|
||||
p7 = vmulq_laneq_f64 (p7, qr6, 1);
|
||||
float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
|
||||
p8 = vmulq_laneq_f64 (p8, qr7, 1);
|
||||
float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
|
||||
p9 = vmulq_laneq_f64 (p9, qr8, 1);
|
||||
float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
|
||||
p10 = vmulq_laneq_f64 (p10, qr9, 1);
|
||||
/* Compute polynomial in d using pairwise Horner scheme. */
|
||||
float64x2_t p90 = vfmaq_f64 (p9, d, p10);
|
||||
float64x2_t p78 = vfmaq_f64 (p7, d, p8);
|
||||
|
|
@ -189,10 +195,11 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
|
|||
return vfmaq_f64 (off, fac, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, erfc, -6.0, 28.0)
|
||||
PL_TEST_ULP (V_NAME_D1 (erfc), 1.21)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
|
||||
TEST_SIG (V, D, 1, erfc, -6.0, 28.0)
|
||||
TEST_ULP (V_NAME_D1 (erfc), 1.21)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
|
||||
TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
|
||||
TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
|
||||
|
|
@ -1,19 +1,20 @@
|
|||
/*
|
||||
* Single-precision vector erfc(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint32x4_t offset, table_scale;
|
||||
float32x4_t max, shift;
|
||||
float32x4_t coeffs, third, two_over_five, tenth;
|
||||
float coeffs[4];
|
||||
float32x4_t third, two_over_five, tenth;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
float32x4_t uflow_bound;
|
||||
#endif
|
||||
|
|
@ -27,7 +28,7 @@ static const struct data
|
|||
.shift = V4 (0x1p17f),
|
||||
/* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
|
||||
fmas. */
|
||||
.coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
|
||||
.coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
|
||||
.third = V4 (0x1.555556p-2f),
|
||||
.two_over_five = V4 (-0x1.99999ap-2f),
|
||||
.tenth = V4 (-0x1.99999ap-4f),
|
||||
|
|
@ -50,12 +51,16 @@ static inline struct entry
|
|||
lookup (uint32x4_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
|
||||
float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
|
||||
float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
|
||||
float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
|
||||
float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
|
||||
float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
|
||||
float32x2_t t0
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
|
||||
float32x2_t t1
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
|
||||
float32x2_t t2
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
|
||||
float32x2_t t3
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
|
||||
float32x4_t e1 = vcombine_f32 (t0, t1);
|
||||
float32x4_t e2 = vcombine_f32 (t2, t3);
|
||||
e.erfc = vuzp1q_f32 (e1, e2);
|
||||
e.scale = vuzp2q_f32 (e1, e2);
|
||||
return e;
|
||||
|
|
@ -86,8 +91,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
|
|||
Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
|
||||
_ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
|
||||
want 0x1.f51216p-120. */
|
||||
VPCS_ATTR
|
||||
float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
|
||||
NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
|
||||
{
|
||||
const struct data *dat = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -130,10 +134,11 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
|
|||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
|
||||
float32x4_t p1 = r;
|
||||
float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
|
||||
float32x4_t coeffs = vld1q_f32 (dat->coeffs);
|
||||
float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
|
||||
float32x4_t p3
|
||||
= vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
|
||||
float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
|
||||
= vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
|
||||
float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
|
||||
p4 = vfmsq_f32 (dat->tenth, r2, p4);
|
||||
|
||||
float32x4_t y = vfmaq_f32 (p3, d, p4);
|
||||
|
|
@ -157,10 +162,13 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
|
|||
return vfmaq_f32 (off, fac, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, erfc, -4.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (erfc), 1.14)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
|
||||
PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
|
||||
HALF_WIDTH_ALIAS_F1 (erfc)
|
||||
|
||||
TEST_SIG (V, F, 1, erfc, -4.0, 10.0)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT)
|
||||
TEST_ULP (V_NAME_F1 (erfc), 1.14)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
|
||||
TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Single-precision vector erf(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -37,12 +37,12 @@ static inline struct entry
|
|||
lookup (uint32x4_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
|
||||
float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
|
||||
float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
|
||||
float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
|
||||
float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
|
||||
float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
|
||||
float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
|
||||
float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
|
||||
float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
|
||||
float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
|
||||
float32x4_t e1 = vcombine_f32 (t0, t1);
|
||||
float32x4_t e2 = vcombine_f32 (t2, t3);
|
||||
e.erf = vuzp1q_f32 (e1, e2);
|
||||
e.scale = vuzp2q_f32 (e1, e2);
|
||||
return e;
|
||||
|
|
@ -61,7 +61,7 @@ lookup (uint32x4_t i)
|
|||
Maximum error: 1.93 ULP
|
||||
_ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
|
||||
want 0x1.fd6868p-9. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
|
||||
{
|
||||
const struct data *dat = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -110,9 +110,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
|
|||
return y;
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, erf, -4.0, 4.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (erf), 1.43)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
|
||||
HALF_WIDTH_ALIAS_F1 (erf)
|
||||
|
||||
TEST_SIG (V, F, 1, erf, -4.0, 4.0)
|
||||
TEST_ULP (V_NAME_F1 (erf), 1.43)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
|
||||
|
|
@ -1,12 +1,14 @@
|
|||
/*
|
||||
* Double-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
|
@ -123,3 +125,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
|
|||
|
||||
return vfmaq_f64 (s, y, s);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, exp, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_D1 (exp), 1.9)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000)
|
||||
|
|
@ -1,14 +1,15 @@
|
|||
/*
|
||||
* Double-precision vector 10^x function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
|
||||
|
|
@ -135,10 +136,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
|
|||
return vfmaq_f64 (s, y, s);
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, exp10, -9.9, 9.9)
|
||||
PL_SIG (V, D, 1, exp10, -9.9, 9.9)
|
||||
PL_TEST_ULP (V_NAME_D1 (exp10), 1.15)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
|
||||
#if WANT_EXP10_TESTS
|
||||
TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
|
||||
TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_D1 (exp10), 1.15)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,23 +1,24 @@
|
|||
/*
|
||||
* Single-precision vector 10^x function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#define _GNU_SOURCE
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_poly_f32.h"
|
||||
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
float32x4_t log10_2_and_inv, shift;
|
||||
|
||||
float32x4_t c0, c1, c3;
|
||||
float log10_2_high, log10_2_low, c2, c4;
|
||||
float32x4_t inv_log10_2, special_bound;
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t scale_thresh;
|
||||
#endif
|
||||
|
|
@ -27,19 +28,24 @@ static const struct data
|
|||
rel error: 0x1.89dafa3p-24
|
||||
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
|
||||
maxerr: 1.85943 +0.5 ulp. */
|
||||
.poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
|
||||
V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
|
||||
.shift = V4 (0x1.8p23f),
|
||||
|
||||
/* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
|
||||
.log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
|
||||
.c0 = V4 (0x1.26bb16p+1f),
|
||||
.c1 = V4 (0x1.5350d2p+1f),
|
||||
.c2 = 0x1.04744ap+1f,
|
||||
.c3 = V4 (0x1.2d8176p+0f),
|
||||
.c4 = 0x1.12b41ap-1f,
|
||||
.inv_log10_2 = V4 (0x1.a934fp+1),
|
||||
.log10_2_high = 0x1.344136p-2,
|
||||
.log10_2_low = 0x1.ec10cp-27,
|
||||
/* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
.special_bound = V4 (126.0f),
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
.scale_thresh = V4 (ScaleBound)
|
||||
#endif
|
||||
};
|
||||
|
||||
#define ExponentBias v_u32 (0x3f800000)
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
# define SpecialBound 38.0f /* rint(log10(2^127)). */
|
||||
|
|
@ -57,17 +63,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
|
|||
|
||||
#else
|
||||
|
||||
# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
# define SpecialOffset v_u32 (0x82000000)
|
||||
# define SpecialBias v_u32 (0x7f000000)
|
||||
# define SpecialBound 126.0f
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
|
|
@ -84,7 +88,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
|||
Algorithm is accurate to 2.36 ULP.
|
||||
_ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
|
||||
want 0x1.7e79cp+11. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
|
@ -102,22 +106,23 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
|
|||
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
|
||||
with poly(r) in [1/sqrt(2), sqrt(2)] and
|
||||
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
|
||||
float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
|
||||
float32x4_t n = vsubq_f32 (z, d->shift);
|
||||
float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
|
||||
r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
|
||||
float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
|
||||
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
|
||||
float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
|
||||
r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
|
||||
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
#endif
|
||||
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t poly
|
||||
= vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
|
||||
v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
|
||||
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
|
||||
float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
|
||||
float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
|
||||
float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
|
@ -129,10 +134,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
|
|||
return vfmaq_f32 (scale, poly, scale);
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, exp10, -9.9, 9.9)
|
||||
PL_SIG (V, F, 1, exp10, -9.9, 9.9)
|
||||
PL_TEST_ULP (V_NAME_F1 (exp10), 1.86)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
|
||||
HALF_WIDTH_ALIAS_F1 (exp10)
|
||||
|
||||
#if WANT_EXP10_TESTS
|
||||
TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
|
||||
TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_F1 (exp10), 1.86)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,19 +1,20 @@
|
|||
/*
|
||||
* Double-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
#define BigBound 1022.0
|
||||
#define UOFlowBound 1280.0
|
||||
#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -38,7 +39,6 @@ lookup_sbits (uint64x2_t i)
|
|||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
|
||||
# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
|
||||
|
||||
/* Call scalar exp2 as a fallback. */
|
||||
|
|
@ -62,8 +62,8 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n,
|
|||
/* 2^(n/N) may overflow, break it up into s1*s2. */
|
||||
uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
|
||||
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (
|
||||
vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
|
||||
vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
|
||||
uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
|
||||
float64x2_t r1 = vmulq_f64 (s1, s1);
|
||||
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
|
||||
|
|
@ -119,10 +119,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
|
|||
return vfmaq_f64 (s, s, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, exp2, -9.9, 9.9)
|
||||
PL_TEST_ULP (V_NAME_D1 (exp2), 1.15)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
|
||||
TEST_SIG (V, D, 1, exp2, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_D1 (exp2), 1.15)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
|
||||
|
|
@ -1,33 +1,38 @@
|
|||
/*
|
||||
* Single-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
uint32x4_t exponent_bias;
|
||||
float32x4_t c1, c3;
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t special_bound, scale_thresh;
|
||||
float32x4_t scale_thresh, special_bound;
|
||||
#endif
|
||||
float c0, c2, c4, zero;
|
||||
} data = {
|
||||
/* maxerr: 1.962 ulp. */
|
||||
.poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
|
||||
V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
|
||||
.c0 = 0x1.59977ap-10f,
|
||||
.c1 = V4 (0x1.3ce9e4p-7f),
|
||||
.c2 = 0x1.c6bd32p-5f,
|
||||
.c3 = V4 (0x1.ebf9bcp-3f),
|
||||
.c4 = 0x1.62e422p-1f,
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
.special_bound = V4 (126.0f),
|
||||
.scale_thresh = V4 (192.0f),
|
||||
#endif
|
||||
};
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
|
||||
|
|
@ -44,16 +49,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
|
|||
|
||||
#else
|
||||
|
||||
# define SpecialOffset v_u32 (0x82000000)
|
||||
# define SpecialBias v_u32 (0x7f000000)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
|
|
@ -66,16 +68,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
|||
|
||||
#endif
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, r, r2, scale, p, q, poly;
|
||||
uint32x4_t cmp, e;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
|
||||
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
|
||||
cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
|
||||
uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
|
||||
float32x4_t xm = x;
|
||||
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
|
||||
special_case to fix special lanes later. This is only necessary if fenv
|
||||
|
|
@ -84,23 +84,24 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
|
|||
x = vbslq_f32 (cmp, v_f32 (1), x);
|
||||
#endif
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
n = vrndaq_f32 (x);
|
||||
r = vsubq_f32 (x, n);
|
||||
e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
|
||||
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
float32x4_t n = vrndaq_f32 (x);
|
||||
float32x4_t r = vsubq_f32 (x, n);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
cmp = vcagtq_f32 (n, d->special_bound);
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
#endif
|
||||
|
||||
r2 = vmulq_f32 (r, r);
|
||||
p = vfmaq_f32 (C (1), C (0), r);
|
||||
q = vfmaq_f32 (C (3), C (2), r);
|
||||
float32x4_t c024 = vld1q_f32 (&d->c0);
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
|
||||
float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
|
||||
q = vfmaq_f32 (q, p, r2);
|
||||
p = vmulq_f32 (C (4), r);
|
||||
poly = vfmaq_f32 (p, q, r2);
|
||||
p = vmulq_laneq_f32 (r, c024, 2);
|
||||
float32x4_t poly = vfmaq_f32 (p, q, r2);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
|
@ -111,3 +112,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
|
|||
|
||||
return vfmaq_f32 (scale, poly, scale);
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (exp2)
|
||||
|
||||
TEST_SIG (V, F, 1, exp2, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_F1 (exp2), 1.49)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000)
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Single-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Constant table for _ZGVnN4v_exp2f_1u.  Every field is a four-lane vector
   initialised through V4.  */
static const struct data
|
||||
{
|
||||
/* c0..c5: polynomial coefficients, consumed in Horner order starting from c0
   (the highest-degree term).  shift is initialised below but is not referenced
   by the functions in this file.  */
float32x4_t c0, c1, c2, c3, c4, c5, shift;
|
||||
/* Added to the shifted integer exponent to form the scale vector.  */
uint32x4_t exponent_bias;
|
||||
/* special_bound: |n| above this sends the vector to specialcase.
   scale_thresh: |n| above this makes specialcase return s1 * s1.  */
float32x4_t special_bound, scale_thresh;
|
||||
/* Bit patterns used by specialcase to split the scale into two factors.  */
uint32x4_t special_offset, special_bias;
|
||||
} data = {
|
||||
.shift = V4 (0x1.8p23f),
|
||||
/* 0x3f800000 is the bit pattern of 1.0f.  */
.exponent_bias = V4 (0x3f800000),
|
||||
.special_bound = V4 (126.0f),
|
||||
.scale_thresh = V4 (192.0f),
|
||||
.special_offset = V4 (0x82000000),
|
||||
/* 0x7f000000 is the bit pattern of 0x1p127f.  */
.special_bias = V4 (0x7f000000),
|
||||
/* maxerr: 0.878 ulp. */
|
||||
.c0 = V4 (0x1.416b5ep-13f),
|
||||
.c1 = V4 (0x1.5f082ep-10f),
|
||||
.c2 = V4 (0x1.3b2dep-7f),
|
||||
.c3 = V4 (0x1.c6af7cp-5f),
|
||||
.c4 = V4 (0x1.ebfbdcp-3f),
|
||||
.c5 = V4 (0x1.62e43p-1f),
|
||||
};
|
||||
|
||||
/* Slow path used when some lane's scale cannot be formed directly.
   p: evaluated polynomial; n: rounded exponent as a float vector;
   e: integer exponent already shifted into the float exponent field;
   d: constant table.
   Returns (p * s1) * s2 per lane, or s1 * s1 in lanes where |n| exceeds
   d->scale_thresh.  */
static float32x4_t VPCS_ATTR NOINLINE
|
||||
specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
/* b is special_offset in lanes with n <= 0 and zero elsewhere.  */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
/* s1 takes the bits special_bias + b, s2 the bits e - b, so b cancels when the
   two exponent fields are combined by the multiplication.  */
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
/* Lanes with |n| > scale_thresh.  */
uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
|
||||
/* With the table values in this file s1 is 0x1p127f for n > 0 and 0x1p-125f
   for n <= 0, so s1 * s1 overflows or underflows respectively.  */
float32x4_t r1 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
|
||||
/* Bitwise per-lane select: r1 where cmp is set, r0 elsewhere.  */
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
|
||||
| (~cmp & vreinterpretq_u32_f32 (r0)));
|
||||
}
|
||||
|
||||
/* Four-lane single-precision 2^x using a degree-6 polynomial evaluated by
   Horner's scheme.  If any lane has |n| above d->special_bound the whole
   vector is handed to specialcase; otherwise the result is scale * p.  */
float32x4_t VPCS_ATTR
|
||||
_ZGVnN4v_exp2f_1u (float32x4_t x)
|
||||
{
|
||||
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
/* NOTE(review): ptr_barrier is defined elsewhere; presumably it keeps the
   compiler from folding the table address - confirm.  */
const struct data *d = ptr_barrier (&data);
|
||||
/* Round to nearest integer, ties away from zero.  */
float32x4_t n = vrndaq_f32 (x);
|
||||
float32x4_t r = x - n;
|
||||
/* Integer n moved into the float exponent field.  */
uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
|
||||
/* Lanes with |n| > special_bound need the split-scale path.  */
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
|
|
||||
/* Horner: p = 1 + r * (c5 + r * (c4 + r * (c3 + r * (c2 + r * (c1 + r * c0))))).  */
float32x4_t p = vfmaq_f32 (d->c1, d->c0, r);
|
||||
p = vfmaq_f32 (d->c2, p, r);
|
||||
p = vfmaq_f32 (d->c3, p, r);
|
||||
p = vfmaq_f32 (d->c4, p, r);
|
||||
p = vfmaq_f32 (d->c5, p, r);
|
||||
p = vfmaq_f32 (v_f32 (1.0f), p, r);
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (p, n, e, d);
|
||||
return scale * p;
|
||||
}
|
||||
|
||||
TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u)
|
||||
TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000)
|
||||
TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000)
|
||||
|
|
@ -1,30 +1,34 @@
|
|||
/*
|
||||
* Single-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
|
||||
uint32x4_t exponent_bias;
|
||||
float32x4_t c1, c3, c4, inv_ln2;
|
||||
float ln2_hi, ln2_lo, c0, c2;
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t special_bound, scale_thresh;
|
||||
#endif
|
||||
} data = {
|
||||
/* maxerr: 1.45358 +0.5 ulp. */
|
||||
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
|
||||
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
|
||||
.shift = V4 (0x1.8p23f),
|
||||
.c0 = 0x1.0e4020p-7f,
|
||||
.c1 = V4 (0x1.573e2ep-5f),
|
||||
.c2 = 0x1.555e66p-3f,
|
||||
.c3 = V4 (0x1.fffdb6p-2f),
|
||||
.c4 = V4 (0x1.ffffecp-1f),
|
||||
.inv_ln2 = V4 (0x1.715476p+0f),
|
||||
.ln2_hi = V4 (0x1.62e4p-1f),
|
||||
.ln2_lo = V4 (0x1.7f7d1cp-20f),
|
||||
.ln2_hi = 0x1.62e4p-1f,
|
||||
.ln2_lo = 0x1.7f7d1cp-20f,
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
.special_bound = V4 (126.0f),
|
||||
.scale_thresh = V4 (192.0f),
|
||||
|
|
@ -49,19 +53,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
|
|||
|
||||
#else
|
||||
|
||||
# define SpecialOffset v_u32 (0x82000000)
|
||||
# define SpecialBias v_u32 (0x7f000000)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
// (s2 + p*s2)*s1 = s2(p+1)s1
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
|
|
@ -71,15 +73,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
|||
|
||||
#endif
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, r, r2, scale, p, q, poly, z;
|
||||
uint32x4_t cmp, e;
|
||||
float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
|
||||
cmp = vcgeq_u32 (
|
||||
uint32x4_t cmp = vcgeq_u32 (
|
||||
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
|
||||
TinyBound),
|
||||
SpecialBound);
|
||||
|
|
@ -93,23 +94,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
|
|||
|
||||
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
z = vfmaq_f32 (d->shift, x, d->inv_ln2);
|
||||
n = vsubq_f32 (z, d->shift);
|
||||
r = vfmsq_f32 (x, n, d->ln2_hi);
|
||||
r = vfmsq_f32 (r, n, d->ln2_lo);
|
||||
e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
|
||||
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
|
||||
float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
|
||||
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
cmp = vcagtq_f32 (n, d->special_bound);
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
#endif
|
||||
|
||||
r2 = vmulq_f32 (r, r);
|
||||
p = vfmaq_f32 (C (1), C (0), r);
|
||||
q = vfmaq_f32 (C (3), C (2), r);
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
|
||||
float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
|
||||
q = vfmaq_f32 (q, p, r2);
|
||||
p = vmulq_f32 (C (4), r);
|
||||
poly = vfmaq_f32 (p, q, r2);
|
||||
p = vmulq_f32 (d->c4, r);
|
||||
float32x4_t poly = vfmaq_f32 (p, q, r2);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
|
@ -120,3 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
|
|||
|
||||
return vfmaq_f32 (scale, poly, scale);
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (exp)
|
||||
|
||||
TEST_SIG (V, F, 1, exp, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_F1 (exp), 1.49)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000)
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Single-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Constant table for _ZGVnN4v_expf_1u.  */
static const struct data
|
||||
{
|
||||
/* shift is initialised below but is not referenced by the functions in this
   file.  */
float32x4_t shift, inv_ln2;
|
||||
/* Added to the shifted integer exponent to form the scale vector.  */
uint32x4_t exponent_bias;
|
||||
float32x4_t c1, c2, c3, c4;
|
||||
/* special_bound: |n| above this sends the vector to specialcase.
   scale_thresh: |n| above this makes specialcase return s1 * s1.  */
float32x4_t special_bound, scale_thresh;
|
||||
uint32x4_t special_offset, special_bias;
|
||||
/* Four consecutive scalars loaded together with vld1q_f32 (&d->ln2_hi) and
   then selected by lane (0: ln2_hi, 1: ln2_lo, 2: c0).  nothing fills the
   fourth lane and has no explicit initialiser.  */
float ln2_hi, ln2_lo, c0, nothing;
|
||||
} data = {
|
||||
.ln2_hi = 0x1.62e4p-1f,
|
||||
.ln2_lo = 0x1.7f7d1cp-20f,
|
||||
.shift = V4 (0x1.8p23f),
|
||||
.inv_ln2 = V4 (0x1.715476p+0f),
|
||||
/* 0x3f800000 is the bit pattern of 1.0f.  */
.exponent_bias = V4 (0x3f800000),
|
||||
.special_bound = V4 (126.0f),
|
||||
.scale_thresh = V4 (192.0f),
|
||||
/* NOTE(review): the exp2f_1u table uses 0x82000000 here; confirm the
   different offset is intentional.  */
.special_offset = V4 (0x83000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
/* maxerr: 0.36565 +0.5 ulp. */
|
||||
.c0 = 0x1.6a6000p-10f,
|
||||
.c1 = V4 (0x1.12718ep-7f),
|
||||
.c2 = V4 (0x1.555af0p-5f),
|
||||
.c3 = V4 (0x1.555430p-3f),
|
||||
.c4 = V4 (0x1.fffff4p-2f),
|
||||
};
|
||||
|
||||
/* Slow path used when some lane's scale cannot be formed directly.
   p: evaluated polynomial; n: rounded exponent as a float vector;
   e: integer exponent already shifted into the float exponent field;
   d: constant table.
   Returns (p * s1) * s2 per lane, or s1 * s1 in lanes where |n| exceeds
   d->scale_thresh.  */
static float32x4_t VPCS_ATTR NOINLINE
|
||||
specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
/* b is special_offset in lanes with n <= 0 and zero elsewhere.  */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
/* s1 takes the bits special_bias + b, s2 the bits e - b, so b cancels when the
   two exponent fields are combined by the multiplication.  */
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
/* Lanes with |n| > scale_thresh.  */
uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
|
||||
/* In lanes selected by cmp the result is s1 squared, which is meant to
   overflow or underflow depending on the sign of n.  */
float32x4_t r1 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
|
||||
/* Bitwise per-lane select: r1 where cmp is set, r0 elsewhere.  */
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
|
||||
| (~cmp & vreinterpretq_u32_f32 (r0)));
|
||||
}
|
||||
|
||||
/* Four-lane single-precision e^x using a degree-6 polynomial evaluated by
   Horner's scheme.  If any lane has |n| above d->special_bound the whole
   vector is handed to specialcase; otherwise the result is scale * p.  */
float32x4_t VPCS_ATTR
|
||||
_ZGVnN4v_expf_1u (float32x4_t x)
|
||||
{
|
||||
/* NOTE(review): ptr_barrier is defined elsewhere; presumably it keeps the
   compiler from folding the table address - confirm.  */
const struct data *d = ptr_barrier (&data);
|
||||
/* One load brings in ln2_hi, ln2_lo and c0 as lanes 0, 1 and 2.  */
float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi);
|
||||
|
|
||||
/* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
float32x4_t z = vmulq_f32 (x, d->inv_ln2);
|
||||
/* Round to nearest integer, ties away from zero.  */
float32x4_t n = vrndaq_f32 (z);
|
||||
/* r = x - n * ln2_hi - n * ln2_lo, subtracting ln2 in two pieces.  */
float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
|
||||
r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
|
||||
/* Integer n moved into the float exponent field.  */
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
|
||||
/* Lanes with |n| > special_bound need the split-scale path.  */
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
/* Horner: p = 1 + r * (1 + r * (c4 + r * (c3 + r * (c2 + r * (c1 + r * c0))))).
   The two final steps both add 1.0f: they supply the r^1 and r^0 terms.  */
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
|
||||
p = vfmaq_f32 (d->c2, p, r);
|
||||
p = vfmaq_f32 (d->c3, p, r);
|
||||
p = vfmaq_f32 (d->c4, p, r);
|
||||
p = vfmaq_f32 (v_f32 (1.0f), p, r);
|
||||
p = vfmaq_f32 (v_f32 (1.0f), p, r);
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (p, n, e, d);
|
||||
return scale * p;
|
||||
}
|
||||
|
||||
TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
|
||||
TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
|
||||
TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)
|
||||
77
contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
Normal file
77
contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Double-precision vector exp(x) - 1 function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_expm1_inline.h"
|
||||
|
||||
/* Constants for the double-precision vector expm1 entry point: the shared
   inline-helper table plus the bounds used to detect special lanes.  Which
   bounds exist depends on WANT_SIMD_EXCEPT.  */
static const struct data
|
||||
{
|
||||
/* Table passed to expm1_inline; its layout is declared in
   v_expm1_inline.h.  */
struct v_expm1_data d;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Integer bounds for the bit-pattern range check in the entry point.  */
uint64x2_t thresh, tiny_bound;
|
||||
#else
|
||||
/* Floating-point bound for the absolute compare in the entry point.  */
float64x2_t oflow_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.d = V_EXPM1_DATA,
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
|
||||
compare. */
|
||||
.thresh = V2 (0x78c56fa6d34b552),
|
||||
/* asuint64(0x1p-51) << 1. */
|
||||
.tiny_bound = V2 (0x3cc0000000000000 << 1),
|
||||
#else
|
||||
/* Value above which expm1(x) should overflow. Absolute value of the
|
||||
underflow bound is greater than this, so it catches both cases - there is
|
||||
a small window where fallbacks are triggered unnecessarily. */
|
||||
.oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
|
||||
#endif
|
||||
};
|
||||
|
||||
/* Fallback used when at least one lane is flagged in special.
   x: original inputs; special: per-lane mask of flagged lanes; d: constant
   table.
   NOTE(review): v_zerofy_f64 and v_call_f64 are defined elsewhere.
   Presumably the flagged lanes are zeroed before the inline vector
   evaluation, and v_call_f64 then replaces those lanes with scalar
   expm1 (x) - confirm against v_math.h.  */
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, uint64x2_t special, const struct data *d)
|
||||
{
|
||||
return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
|
||||
special);
|
||||
}
|
||||
|
||||
/* Double-precision vector exp(x) - 1 function.
|
||||
The maximum observed error is 2.05 ULP:
|
||||
_ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
|
||||
want 0x1.a8897eef87b32p-2. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
|
||||
{
|
||||
/* NOTE(review): ptr_barrier is defined elsewhere; presumably it keeps the
   compiler from folding the table address - confirm.  */
const struct data *d = ptr_barrier (&data);
|
||||
|
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|
||||
|x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
|
||||
shift-left by 1, and compare with thresh which was left-shifted offline -
|
||||
this is effectively an absolute compare. */
|
||||
/* The unsigned subtraction wraps for values below tiny_bound, so a single
   >= compare flags both ends of the range.  */
uint64x2_t special
|
||||
= vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
|
||||
#else
|
||||
/* Large input, NaNs and Infs. */
|
||||
/* NOTE(review): an absolute floating-point compare is expected to be false
   for NaN lanes, so NaNs likely take the inline path rather than the
   fallback - confirm.  */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
|
||||
#endif
|
||||
|
|
||||
/* Any flagged lane sends the whole vector through special_case.  */
if (unlikely (v_any_u64 (special)))
|
||||
return special_case (x, special, d);
|
||||
|
|
||||
/* expm1(x) ~= p * t + (t - 1). */
|
||||
return expm1_inline (x, &d->d);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, expm1, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_D1 (expm1), 1.56)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
|
||||
82
contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
Normal file
82
contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Single-precision vector exp(x) - 1 function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_expm1f_inline.h"
|
||||
|
||||
/* Constants for the single-precision vector expm1 entry point: the shared
   inline-helper table plus the bound used to detect special lanes.  Which
   bound exists depends on WANT_SIMD_EXCEPT.  */
static const struct data
|
||||
{
|
||||
/* Table passed to expm1f_inline; its layout is declared in
   v_expm1f_inline.h.  */
struct v_expm1f_data d;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Integer bound for the bit-pattern range check in the entry point.  */
uint32x4_t thresh;
|
||||
#else
|
||||
/* Floating-point bound for the absolute compare in the entry point.  */
float32x4_t oflow_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.d = V_EXPM1F_DATA,
|
||||
/* The initialiser tests the negated condition, so the two branches appear in
   the opposite order to the declaration above.  */
#if !WANT_SIMD_EXCEPT
|
||||
/* Value above which expm1f(x) should overflow. Absolute value of the
|
||||
underflow bound is greater than this, so it catches both cases - there is
|
||||
a small window where fallbacks are triggered unnecessarily. */
|
||||
.oflow_bound = V4 (0x1.5ebc4p+6),
|
||||
#else
|
||||
/* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
|
||||
compare. */
|
||||
.thresh = V4 (0x1d5ebc40),
|
||||
#endif
|
||||
};
|
||||
|
||||
/* asuint(0x1p-23), shifted by 1 for abs compare. */
|
||||
#define TinyBound v_u32 (0x34000000 << 1)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
|
||||
{
|
||||
return v_call_f32 (
|
||||
expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
|
||||
}
|
||||
|
||||
/* Single-precision vector exp(x) - 1 function.
|
||||
The maximum error is 1.62 ULP:
|
||||
_ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
|
||||
want 0x1.da9f44p-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|
||||
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
|
||||
shift-left by 1, and compare with thresh which was left-shifted offline -
|
||||
this is effectively an absolute compare. */
|
||||
uint32x4_t special
|
||||
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
|
||||
#else
|
||||
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
|
||||
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
|
||||
#endif
|
||||
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return special_case (x, special, d);
|
||||
|
||||
/* expm1(x) ~= p * t + (t - 1). */
|
||||
return expm1f_inline (x, &d->d);
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (expm1)
|
||||
|
||||
TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
|
||||
TEST_ULP (V_NAME_F1 (expm1), 1.13)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
|
||||
TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
|
||||
TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
|
||||
TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
|
||||
TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Double-precision x^y function.
|
||||
*
|
||||
* Copyright (c) 2018-2023, Arm Limited.
|
||||
* Copyright (c) 2018-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
|
|
@ -108,7 +108,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
|
|||
sbits -= 1009ull << 52;
|
||||
scale = asdouble (sbits);
|
||||
y = 0x1p1009 * (scale + scale * tmp);
|
||||
return check_oflow (eval_as_double (y));
|
||||
return y;
|
||||
}
|
||||
/* k < 0, need special care in the subnormal range. */
|
||||
sbits += 1022ull << 52;
|
||||
|
|
@ -128,7 +128,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
|
|||
lo = scale - y + scale * tmp;
|
||||
hi = one + y;
|
||||
lo = one - hi + y + lo;
|
||||
y = eval_as_double (hi + lo) - one;
|
||||
y = (hi + lo) - one;
|
||||
/* Fix the sign of 0. */
|
||||
if (y == 0.0)
|
||||
y = asdouble (sbits & 0x8000000000000000);
|
||||
|
|
@ -137,7 +137,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
|
|||
}
|
||||
#endif
|
||||
y = 0x1p-1022 * y;
|
||||
return check_uflow (eval_as_double (y));
|
||||
return y;
|
||||
}
|
||||
|
||||
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
|
||||
|
|
@ -192,7 +192,7 @@ exp_inline (double x, double xtail, uint32_t sign_bias)
|
|||
double scale = asdouble (sbits);
|
||||
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
|
||||
is no spurious underflow here even without fma. */
|
||||
return eval_as_double (scale + scale * tmp);
|
||||
return scale + scale * tmp;
|
||||
}
|
||||
|
||||
/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
|
||||
|
|
@ -239,7 +239,7 @@ exp_nosignbias (double x, double xtail)
|
|||
double scale = asdouble (sbits);
|
||||
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
|
||||
is no spurious underflow here even without fma. */
|
||||
return eval_as_double (scale + scale * tmp);
|
||||
return scale + scale * tmp;
|
||||
}
|
||||
|
||||
/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
|
||||
|
|
@ -267,7 +267,7 @@ zeroinfnan (uint64_t i)
|
|||
}
|
||||
|
||||
static double NOINLINE
|
||||
__pl_finite_pow (double x, double y)
|
||||
pow_scalar_special_case (double x, double y)
|
||||
{
|
||||
uint32_t sign_bias = 0;
|
||||
uint64_t ix, iy;
|
||||
|
|
@ -311,9 +311,7 @@ __pl_finite_pow (double x, double y)
|
|||
if (2 * ix == 0 && iy >> 63)
|
||||
return __math_divzero (sign_bias);
|
||||
#endif
|
||||
/* Without the barrier some versions of clang hoist the 1/x2 and
|
||||
thus division by zero exception can be signaled spuriously. */
|
||||
return iy >> 63 ? opt_barrier_double (1 / x2) : x2;
|
||||
return iy >> 63 ? 1 / x2 : x2;
|
||||
}
|
||||
/* Here x and y are non-zero finite. */
|
||||
if (ix >> 63)
|
||||
|
|
@ -349,9 +347,7 @@ __pl_finite_pow (double x, double y)
|
|||
if (topx == 0)
|
||||
{
|
||||
/* Normalize subnormal x so exponent becomes negative. */
|
||||
/* Without the barrier some versions of clang evaluate the mul
|
||||
unconditionally causing spurious overflow exceptions. */
|
||||
ix = asuint64 (opt_barrier_double (x) * 0x1p52);
|
||||
ix = asuint64 (x * 0x1p52);
|
||||
ix &= 0x7fffffffffffffff;
|
||||
ix -= 52ULL << 52;
|
||||
}
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Double-precision vector hypot(x, y) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
static const struct data
|
||||
|
|
@ -15,7 +15,7 @@ static const struct data
|
|||
uint64x2_t tiny_bound, thres;
|
||||
} data = {
|
||||
.tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
|
||||
.thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
|
||||
.thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
|
||||
};
|
||||
#else
|
||||
static const struct data
|
||||
|
|
@ -24,7 +24,7 @@ static const struct data
|
|||
uint32x4_t thres;
|
||||
} data = {
|
||||
.tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
|
||||
.thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
|
||||
.thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -75,9 +75,9 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
|
|||
|
||||
float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
|
||||
|
||||
uint32x2_t special = vcge_u32 (
|
||||
vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
|
||||
vget_low_u32 (d->thres));
|
||||
uint32x2_t special
|
||||
= vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
|
||||
vget_low_u32 (d->thres));
|
||||
|
||||
if (unlikely (v_any_u32h (special)))
|
||||
return special_case (x, y, sqsum, special);
|
||||
|
|
@ -86,10 +86,10 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
|
|||
}
|
||||
#endif
|
||||
|
||||
PL_SIG (V, D, 2, hypot, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_D2 (hypot), 1.21)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
|
||||
TEST_SIG (V, D, 2, hypot, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_D2 (hypot), 1.21)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Single-precision vector hypot(x, y) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
static const struct data
|
||||
|
|
@ -15,7 +15,7 @@ static const struct data
|
|||
uint32x4_t tiny_bound, thres;
|
||||
} data = {
|
||||
.tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
|
||||
.thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
|
||||
.thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
|
||||
};
|
||||
#else
|
||||
static const struct data
|
||||
|
|
@ -24,7 +24,7 @@ static const struct data
|
|||
uint16x8_t thres;
|
||||
} data = {
|
||||
.tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
|
||||
.thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
|
||||
.thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -41,7 +41,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
|
|||
want 0x1.6a41dp-13. */
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -68,15 +68,15 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
|
|||
}
|
||||
#else
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
|
||||
|
||||
uint16x4_t special = vcge_u16 (
|
||||
vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
|
||||
vget_low_u16 (d->thres));
|
||||
uint16x4_t special
|
||||
= vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
|
||||
vget_low_u16 (d->thres));
|
||||
|
||||
if (unlikely (v_any_u16h (special)))
|
||||
return special_case (x, y, sqsum, special);
|
||||
|
|
@ -85,10 +85,12 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
|
|||
}
|
||||
#endif
|
||||
|
||||
PL_SIG (V, F, 2, hypot, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F2 (hypot), 1.21)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
|
||||
HALF_WIDTH_ALIAS_F2 (hypot)
|
||||
|
||||
TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F2 (hypot), 1.21)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
|
||||
118
contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
Normal file
118
contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
/*
|
||||
* Double-precision vector log(x) function.
|
||||
*
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
||||
uint32x4_t special_bound;
|
||||
float64x2_t c0, c2;
|
||||
double c1, c3, ln2, c4;
|
||||
} data = {
|
||||
/* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
|
||||
.c0 = V2 (-0x1.ffffffffffff7p-2),
|
||||
.c1 = 0x1.55555555170d4p-2,
|
||||
.c2 = V2 (-0x1.0000000399c27p-2),
|
||||
.c3 = 0x1.999b2e90e94cap-3,
|
||||
.c4 = -0x1.554e550bd501ep-3,
|
||||
.ln2 = 0x1.62e42fefa39efp-1,
|
||||
.sign_exp_mask = V2 (0xfff0000000000000),
|
||||
.off = V2 (0x3fe6900900000000),
|
||||
/* Lower bound is the smallest positive normal double 0x0010000000000000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound - offset (which wraps around). */
|
||||
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||
.special_bound = V4 (0x7fe00000), /* top32(asuint64(inf) - asuint64(0x1p-1022)). */
|
||||
};
|
||||
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
struct entry
|
||||
{
|
||||
float64x2_t invc;
|
||||
float64x2_t logc;
|
||||
};
|
||||
|
||||
static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
/* Since N is a power of 2, n % N = n & (N - 1). */
|
||||
struct entry e;
|
||||
uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
e.logc = vuzp2q_f64 (e0, e1);
|
||||
return e;
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
||||
uint32x2_t special, const struct data *d)
|
||||
{
|
||||
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||
return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
||||
}
|
||||
|
||||
/* Double-precision vector log routine.
|
||||
The maximum observed error is 2.17 ULP:
|
||||
_ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
|
||||
want 0x1.ffffff1cca045p-2. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
|
||||
struct entry e = lookup (u_off);
|
||||
|
||||
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||
vget_low_u32 (d->special_bound));
|
||||
|
||||
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
|
||||
/* hi = r + log(c) + k*Ln2. */
|
||||
float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
|
||||
float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
|
||||
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
|
||||
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||
y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
|
||||
y = vfmaq_f64 (p, r2, y);
|
||||
|
||||
if (unlikely (v_any_u32h (special)))
|
||||
return special_case (hi, u_off, y, r2, special, d);
|
||||
return vfmaq_f64 (hi, y, r2);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, log, 0.01, 11.1)
|
||||
TEST_ULP (V_NAME_D1 (log), 1.67)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000)
|
||||
132
contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
Normal file
132
contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
* Double-precision vector log10(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
||||
uint32x4_t special_bound;
|
||||
double invln10, log10_2;
|
||||
double c1, c3;
|
||||
float64x2_t c0, c2, c4;
|
||||
} data = {
|
||||
/* Computed from log coefficients divided by log(10) then rounded to double
|
||||
precision. */
|
||||
.c0 = V2 (-0x1.bcb7b1526e506p-3),
|
||||
.c1 = 0x1.287a7636be1d1p-3,
|
||||
.c2 = V2 (-0x1.bcb7b158af938p-4),
|
||||
.c3 = 0x1.63c78734e6d07p-4,
|
||||
.c4 = V2 (-0x1.287461742fee4p-4),
|
||||
.invln10 = 0x1.bcb7b1526e50ep-2,
|
||||
.log10_2 = 0x1.34413509f79ffp-2,
|
||||
.off = V2 (0x3fe6900900000000),
|
||||
.sign_exp_mask = V2 (0xfff0000000000000),
|
||||
/* Lower bound is 0x0010000000000000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound - offset (which wraps around). */
|
||||
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
|
||||
};
|
||||
|
||||
#define N (1 << V_LOG10_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
struct entry
|
||||
{
|
||||
float64x2_t invc;
|
||||
float64x2_t log10c;
|
||||
};
|
||||
|
||||
static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
uint64_t i0
|
||||
= (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i1
|
||||
= (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
e.log10c = vuzp2q_f64 (e0, e1);
|
||||
return e;
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
||||
uint32x2_t special, const struct data *d)
|
||||
{
|
||||
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||
return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
||||
}
|
||||
|
||||
/* Fast implementation of double-precision vector log10
|
||||
is a slight modification of double-precision vector log.
|
||||
Max ULP error: < 2.5 ulp (nearest rounding.)
|
||||
Maximum measured at 2.46 ulp for x in [0.96, 0.97]
|
||||
_ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
|
||||
want 0x1.fff6be3cae4b9p-6. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
|
||||
struct entry e = lookup (u_off);
|
||||
|
||||
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||
vget_low_u32 (d->special_bound));
|
||||
|
||||
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
|
||||
/* hi = r / log(10) + log10(c) + k*log10(2).
|
||||
Constants in v_log10_data.c are computed (in extended precision) as
|
||||
e.log10c := e.logc * invln10. */
|
||||
float64x2_t cte = vld1q_f64 (&d->invln10);
|
||||
float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
|
||||
|
||||
/* hi += k * log10(2). */
|
||||
hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
|
||||
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
|
||||
float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
|
||||
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||
y = vfmaq_f64 (y, d->c4, r2);
|
||||
y = vfmaq_f64 (p, y, r2);
|
||||
|
||||
if (unlikely (v_any_u32h (special)))
|
||||
return special_case (hi, u_off, y, r2, special, d);
|
||||
return vfmaq_f64 (hi, y, r2);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, log10, 0.01, 11.1)
|
||||
TEST_ULP (V_NAME_D1 (log10), 1.97)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
|
||||
106
contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
Normal file
106
contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Single-precision vector log10 function.
|
||||
*
|
||||
* Copyright (c) 2020-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
|
||||
uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
uint32x4_t mantissa_mask;
|
||||
float c1, c3, c5, c7;
|
||||
} data = {
|
||||
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
|
||||
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
|
||||
.c0 = V4 (-0x1.bcb79cp-3f),
|
||||
.c1 = 0x1.2879c8p-3f,
|
||||
.c2 = V4 (-0x1.bcd472p-4f),
|
||||
.c3 = 0x1.6408f8p-4f,
|
||||
.c4 = V4 (-0x1.246f8p-4f),
|
||||
.c5 = 0x1.f0e514p-5f,
|
||||
.c6 = V4 (-0x1.0fc92cp-4f),
|
||||
.c7 = 0x1.f5f76ap-5f,
|
||||
.ln2 = V4 (0x1.62e43p-1f),
|
||||
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
|
||||
/* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
|
||||
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
|
||||
.special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
|
||||
.off = V4 (0x3f2aaaab), /* 0.666667. */
|
||||
.mantissa_mask = V4 (0x007fffff),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
|
||||
uint16x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
|
||||
vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
|
||||
}
|
||||
|
||||
/* Fast implementation of AdvSIMD log10f,
|
||||
uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and
|
||||
an order 9 polynomial.
|
||||
Maximum error: 3.305ulps (nearest rounding.)
|
||||
_ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
|
||||
want 0x1.ffe2f4p-4. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
uint32x4_t u_off = vreinterpretq_u32_f32 (x);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
|
||||
u_off = vsubq_u32 (u_off, d->off);
|
||||
float32x4_t n = vcvtq_f32_s32 (
|
||||
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
|
||||
|
||||
uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
vget_low_u16 (d->special_bound));
|
||||
|
||||
uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
|
||||
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
|
||||
/* y = log10(1+r) + n * log10(2). */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
|
||||
float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
|
||||
float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
|
||||
float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
|
||||
float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
|
||||
|
||||
float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
|
||||
float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
|
||||
float32x4_t poly = vfmaq_f32 (c01, r2, p27);
|
||||
|
||||
/* y = (r + Ln(2) * n) * InvLn(10) = r / Ln(10) + n * Log10(2). */
|
||||
float32x4_t y = vfmaq_f32 (r, d->ln2, n);
|
||||
y = vmulq_f32 (y, d->inv_ln10);
|
||||
|
||||
if (unlikely (v_any_u16h (special)))
|
||||
return special_case (y, u_off, poly, r2, special, d);
|
||||
return vfmaq_f32 (y, poly, r2);
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (log10)
|
||||
|
||||
TEST_SIG (V, F, 1, log10, 0.01, 11.1)
|
||||
TEST_ULP (V_NAME_F1 (log10), 2.81)
|
||||
TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
|
||||
TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
|
||||
TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
|
||||
61
contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
Normal file
61
contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
/*
|
||||
* Double-precision vector log(1+x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define WANT_V_LOG1P_K0_SHORTCUT 0
|
||||
#include "v_log1p_inline.h"
|
||||
|
||||
const static struct data
|
||||
{
|
||||
struct v_log1p_data d;
|
||||
uint64x2_t inf, minus_one;
|
||||
} data = { .d = V_LOG1P_CONSTANTS_TABLE,
|
||||
.inf = V2 (0x7ff0000000000000),
|
||||
.minus_one = V2 (0xbff0000000000000) };
|
||||
|
||||
#define BottomMask v_u64 (0xffffffff)
|
||||
|
||||
static float64x2_t NOINLINE VPCS_ATTR
|
||||
special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
|
||||
{
|
||||
/* Side-step special lanes so fenv exceptions are not triggered
|
||||
inadvertently. */
|
||||
float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
|
||||
return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
|
||||
}
|
||||
|
||||
/* Vector log1p approximation using polynomial on reduced interval. Routine is
|
||||
a modification of the algorithm used in scalar log1p, with no shortcut for
|
||||
k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
|
||||
_ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
|
||||
want 0x1.fd61d0727429fp+2 . */
|
||||
VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
|
||||
|
||||
uint64x2_t special_cases
|
||||
= vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
|
||||
|
||||
if (unlikely (v_any_u64 (special_cases)))
|
||||
return special_case (x, special_cases, d);
|
||||
|
||||
return log1p_inline (x, &d->d);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, log1p, -0.9, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (log1p), 1.95)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
|
||||
92
contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
Normal file
92
contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* Single-precision vector log(1+x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_log1pf_inline.h"
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
const static struct data
|
||||
{
|
||||
uint32x4_t minus_one, thresh;
|
||||
struct v_log1pf_data d;
|
||||
} data = {
|
||||
.d = V_LOG1PF_CONSTANTS_TABLE,
|
||||
.thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
|
||||
.minus_one = V4 (0xbf800000),
|
||||
};
|
||||
|
||||
/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
|
||||
# define TinyBound v_u32 (0x34000000)
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Side-step special lanes so fenv exceptions are not triggered
|
||||
inadvertently. */
|
||||
float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
|
||||
return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
|
||||
}
|
||||
|
||||
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
|
||||
error is 1.69 ULP:
|
||||
_ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
|
||||
want 0x1.cfcbdcp-3. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
|
||||
|
||||
uint32x4_t special_cases
|
||||
= vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
|
||||
vcgeq_u32 (ix, d->minus_one));
|
||||
|
||||
if (unlikely (v_any_u32 (special_cases)))
|
||||
return special_case (x, special_cases, d);
|
||||
|
||||
return log1pf_inline (x, &d->d);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, uint32x4_t cmp)
|
||||
{
|
||||
return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
|
||||
}
|
||||
|
||||
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
|
||||
error is 1.63 ULP:
|
||||
_ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
|
||||
want 0x1.fdcb16p-3. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
|
||||
{
|
||||
uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
|
||||
vcaleq_f32 (x, v_f32 (0x1p127f)));
|
||||
|
||||
if (unlikely (v_any_u32 (special_cases)))
|
||||
return special_case (x, special_cases);
|
||||
|
||||
return log1pf_inline (x, ptr_barrier (&data));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Alias and test registrations for the single-precision vector log1p routine.
   NOTE(review): HALF_WIDTH_ALIAS_F1 and the TEST_* macros are defined outside
   this view (presumably reached through v_math.h, test_sig.h and test_defs.h);
   their expansion is not confirmed here.  The TEST_ULP bound (1.20) looks like
   the documented 1.69 ULP worst case minus 0.5 - confirm against the harness
   convention before changing either number.  */
HALF_WIDTH_ALIAS_F1 (log1p)
|
||||
|
||||
TEST_SIG (V, F, 1, log1p, -0.9, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (log1p), 1.20)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
|
||||
123
contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
Normal file
123
contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Double-precision vector log2 function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
||||
uint32x4_t special_bound;
|
||||
float64x2_t c0, c2;
|
||||
double c1, c3, invln2, c4;
|
||||
} data = {
|
||||
/* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
|
||||
and N = 128, then scaled by log2(e) in extended precision and rounded back
|
||||
to double precision. */
|
||||
.c0 = V2 (-0x1.71547652b8300p-1),
|
||||
.c1 = 0x1.ec709dc340953p-2,
|
||||
.c2 = V2 (-0x1.71547651c8f35p-2),
|
||||
.c3 = 0x1.2777ebe12dda5p-2,
|
||||
.c4 = -0x1.ec738d616fe26p-3,
|
||||
.invln2 = 0x1.71547652b82fep0,
|
||||
.off = V2 (0x3fe6900900000000),
|
||||
.sign_exp_mask = V2 (0xfff0000000000000),
|
||||
/* Lower bound is 0x0010000000000000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound - offset (which wraps around). */
|
||||
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
|
||||
};
|
||||
|
||||
#define N (1 << V_LOG2_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
struct entry
|
||||
{
|
||||
float64x2_t invc;
|
||||
float64x2_t log2c;
|
||||
};
|
||||
|
||||
static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
uint64_t i0
|
||||
= (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i1
|
||||
= (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
e.log2c = vuzp2q_f64 (e0, e1);
|
||||
return e;
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
||||
uint32x2_t special, const struct data *d)
|
||||
{
|
||||
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||
return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
||||
}
|
||||
|
||||
/* Double-precision vector log2 routine. Implements the same algorithm as
|
||||
vector log10, with coefficients and table entries scaled in extended
|
||||
precision. The maximum observed error is 2.58 ULP:
|
||||
_ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
|
||||
want 0x1.fffb34198d9ddp-5. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
|
||||
struct entry e = lookup (u_off);
|
||||
|
||||
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||
vget_low_u32 (d->special_bound));
|
||||
|
||||
/* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
|
||||
float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
|
||||
float64x2_t hi
|
||||
= vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
|
||||
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
|
||||
float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
|
||||
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||
y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
|
||||
y = vfmaq_f64 (p, r2, y);
|
||||
|
||||
if (unlikely (v_any_u32h (special)))
|
||||
return special_case (hi, u_off, y, r2, special, d);
|
||||
return vfmaq_f64 (hi, y, r2);
|
||||
}
|
||||
|
||||
/* Test registrations for the double-precision vector log2 routine.
   NOTE(review): the TEST_* macros are defined outside this view; their
   expansion is not confirmed here.  The TEST_ULP bound (2.09) looks like the
   documented 2.58 ULP maximum minus 0.5 - confirm against the harness
   convention.  */
TEST_SIG (V, D, 1, log2, 0.01, 11.1)
|
||||
TEST_ULP (V_NAME_D1 (log2), 2.09)
|
||||
TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
|
||||
/* NOTE(review): 0x1p-149 and 0x1p-126 are the single-precision subnormal and
   normal limits; no interval below starts under 0x1p-149, so positive
   double-precision subnormals (below 0x1p-1022) appear not to be sampled -
   confirm whether that is intended.  */
TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
|
||||
TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
|
||||
102
contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
Normal file
102
contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
/*
|
||||
* Single-precision vector log2 function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t c0, c2, c4, c6, c8;
|
||||
uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
uint32x4_t mantissa_mask;
|
||||
float c1, c3, c5, c7;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm approximate
|
||||
log2(1+r)/r for r in [ -1/3, 1/3 ].
|
||||
rel error: 0x1.c4c4b0cp-26. */
|
||||
.c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
|
||||
.c1 = -0x1.715458p-1f,
|
||||
.c2 = V4 (0x1.ec701cp-2f),
|
||||
.c3 = -0x1.7171a4p-2f,
|
||||
.c4 = V4 (0x1.27a0b8p-2f),
|
||||
.c5 = -0x1.e5143ep-3f,
|
||||
.c6 = V4 (0x1.9d8ecap-3f),
|
||||
.c7 = -0x1.c675bp-3f,
|
||||
.c8 = V4 (0x1.9e495p-3f),
|
||||
/* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
|
||||
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
|
||||
.special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
|
||||
.off = V4 (0x3f2aaaab), /* 0.666667. */
|
||||
.mantissa_mask = V4 (0x007fffff),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
|
||||
uint16x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
|
||||
vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
|
||||
}
|
||||
|
||||
/* Fast implementation for single precision AdvSIMD log2,
|
||||
relies on same argument reduction as AdvSIMD logf.
|
||||
Maximum error: 2.48 ULPs
|
||||
_ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
|
||||
want 0x1.a9be8p-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
uint32x4_t u_off = vreinterpretq_u32_f32 (x);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
|
||||
u_off = vsubq_u32 (u_off, d->off);
|
||||
float32x4_t n = vcvtq_f32_s32 (
|
||||
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
|
||||
|
||||
uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
vget_low_u16 (d->special_bound));
|
||||
|
||||
uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
|
||||
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
|
||||
/* y = log2(1+r) + n. */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
|
||||
float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
|
||||
float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
|
||||
float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
|
||||
float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
|
||||
float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
|
||||
float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
|
||||
float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
|
||||
float32x4_t p = vfmaq_f32 (c01, r2, p28);
|
||||
|
||||
if (unlikely (v_any_u16h (special)))
|
||||
return special_case (n, u_off, p, r, special, d);
|
||||
return vfmaq_f32 (n, p, r);
|
||||
}
|
||||
|
||||
/* Alias and test registrations for the single-precision vector log2 routine.
   NOTE(review): HALF_WIDTH_ALIAS_F1 and the TEST_* macros are defined outside
   this view; their expansion is not confirmed here.  The TEST_ULP bound
   (1.99) looks like the documented 2.48 ULP maximum minus 0.5 - confirm
   against the harness convention.  */
HALF_WIDTH_ALIAS_F1 (log2)
|
||||
|
||||
TEST_SIG (V, F, 1, log2, 0.01, 11.1)
|
||||
TEST_ULP (V_NAME_F1 (log2), 1.99)
|
||||
TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
|
||||
TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
|
||||
88
contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
Normal file
88
contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* Single-precision vector log function.
|
||||
*
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t c2, c4, c6, ln2;
|
||||
uint32x4_t off, offset_lower_bound, mantissa_mask;
|
||||
uint16x8_t special_bound;
|
||||
float c1, c3, c5, c0;
|
||||
} data = {
|
||||
/* 3.34 ulp error. */
|
||||
.c0 = -0x1.3e737cp-3f,
|
||||
.c1 = 0x1.5a9aa2p-3f,
|
||||
.c2 = V4 (-0x1.4f9934p-3f),
|
||||
.c3 = 0x1.961348p-3f,
|
||||
.c4 = V4 (-0x1.00187cp-2f),
|
||||
.c5 = 0x1.555d7cp-2f,
|
||||
.c6 = V4 (-0x1.ffffc8p-2f),
|
||||
.ln2 = V4 (0x1.62e43p-1f),
|
||||
/* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
|
||||
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
|
||||
.special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
|
||||
.off = V4 (0x3f2aaaab), /* 0.666667. */
|
||||
.mantissa_mask = V4 (0x007fffff)
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
|
||||
uint16x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
|
||||
vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
|
||||
}
|
||||
|
||||
/* Single-precision vector log: reduces x to 2^n * (1+r), evaluates a
   polynomial in r, and adds n * ln2.  Lanes flagged by the range check are
   recomputed by the special-case handler.  */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);

  /* Only the offset-subtracted bit pattern is kept live; the special-case
     handler recovers x by adding the offset back, which avoids having to
     move x out of the way.  */
  uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);

  /* Range check on the narrowed high half of (u_off - offset_lower_bound);
     lanes at or above special_bound take the scalar fallback.  */
  uint16x4_t top16 = vsubhn_u32 (u_off, d->offset_lower_bound);
  uint16x4_t cmp = vcge_u16 (top16, vget_low_u16 (d->special_bound));

  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  The arithmetic shift
     sign-extends the exponent.  */
  int32x4_t exponent = vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23);
  float32x4_t n = vcvtq_f32_s32 (exponent);

  uint32x4_t mantissa = vandq_u32 (u_off, d->mantissa_mask);
  float32x4_t one_plus_r
      = vreinterpretq_f32_u32 (vaddq_u32 (mantissa, d->off));
  float32x4_t r = vsubq_f32 (one_plus_r, v_f32 (1.0f));
  float32x4_t r2 = vmulq_f32 (r, r);

  /* y = log(1+r) + n*ln2, evaluated as
     n*ln2 + r + r2*(c6 + r*c5 + r2*(c4 + r*c3 + r2*(c2 + r*c1 + r2*c0))).  */
  float32x4_t c1350 = vld1q_f32 (&d->c1);
  float32x4_t inner = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
  inner = vfmaq_laneq_f32 (inner, r2, c1350, 3);
  float32x4_t middle = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
  middle = vfmaq_f32 (middle, inner, r2);
  float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
  y = vfmaq_f32 (y, middle, r2);

  /* Leading terms: r + n * ln2.  */
  float32x4_t head = vfmaq_f32 (r, d->ln2, n);

  if (unlikely (v_any_u16h (cmp)))
    return special_case (head, u_off, y, r2, cmp, d);
  return vfmaq_f32 (head, y, r2);
}
|
||||
|
||||
/* Alias and test registrations for the single-precision vector log routine.
   NOTE(review): HALF_WIDTH_ALIAS_F1 and the TEST_* macros are defined outside
   this view; their expansion is not confirmed here.  The TEST_ULP bound (2.9)
   looks like the 3.34 ULP error quoted with the coefficients minus 0.5,
   rounded up - confirm against the harness convention.  */
HALF_WIDTH_ALIAS_F1 (log)
|
||||
|
||||
TEST_SIG (V, F, 1, log, 0.01, 11.1)
|
||||
TEST_ULP (V_NAME_F1 (log), 2.9)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT)
|
||||
TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000)
|
||||
TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000)
|
||||
33
contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
Normal file
33
contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
/*
|
||||
* Double-precision vector modf(x, *y) function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Modf algorithm. Produces exact values in all rounding modes. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int)
|
||||
{
|
||||
/* Get integer component of x. */
|
||||
float64x2_t rounded = vrndq_f64 (x);
|
||||
vst1q_f64 (out_int, rounded);
|
||||
|
||||
/* Subtract integer component from input. */
|
||||
uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded));
|
||||
|
||||
/* Return +0 for integer x. */
|
||||
uint64x2_t is_integer = vceqq_f64 (x, rounded);
|
||||
return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer));
|
||||
}
|
||||
|
||||
/* Test registrations for the double-precision vector modf routine, with a
   0.0 ULP bound (exact results) for each output.
   NOTE(review): _ZGVnN2vl8_modf_frac and _ZGVnN2vl8_modf_int are not defined
   in this view - presumably harness wrappers exposing the returned fractional
   part and the stored integral part; confirm.  */
TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0)
|
||||
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000)
|
||||
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000)
|
||||
|
||||
TEST_ULP (_ZGVnN2vl8_modf_int, 0.0)
|
||||
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000)
|
||||
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000)
|
||||
34
contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
Normal file
34
contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
/*
|
||||
* Single-precision vector modf(x, *y) function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Modff algorithm. Produces exact values in all rounding modes. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x,
|
||||
float *out_int)
|
||||
{
|
||||
/* Get integer component of x. */
|
||||
float32x4_t rounded = vrndq_f32 (x);
|
||||
vst1q_f32 (out_int, rounded);
|
||||
|
||||
/* Subtract integer component from input. */
|
||||
uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded));
|
||||
|
||||
/* Return +0 for integer x. */
|
||||
uint32x4_t is_integer = vceqq_f32 (x, rounded);
|
||||
return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer));
|
||||
}
|
||||
|
||||
/* Test registrations for the single-precision vector modf routine, with a
   0.0 ULP bound (exact results) for each output.
   NOTE(review): _ZGVnN4vl4_modff_frac and _ZGVnN4vl4_modff_int are not
   defined in this view - presumably harness wrappers exposing the returned
   fractional part and the stored integral part; confirm.  */
TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0)
|
||||
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000)
|
||||
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000)
|
||||
|
||||
TEST_ULP (_ZGVnN4vl4_modff_int, 0.0)
|
||||
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000)
|
||||
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000)
|
||||
|
|
@ -1,20 +1,17 @@
|
|||
/*
|
||||
* Double-precision vector pow function.
|
||||
*
|
||||
* Copyright (c) 2020-2023, Arm Limited.
|
||||
* Copyright (c) 2020-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Defines parameters of the approximation and scalar fallback. */
|
||||
#include "finite_pow.h"
|
||||
|
||||
#define VecSmallExp v_u64 (SmallExp)
|
||||
#define VecThresExp v_u64 (ThresExp)
|
||||
|
||||
#define VecSmallPowX v_u64 (SmallPowX)
|
||||
#define VecThresPowX v_u64 (ThresPowX)
|
||||
#define VecSmallPowY v_u64 (SmallPowY)
|
||||
|
|
@ -22,34 +19,49 @@
|
|||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t log_poly[7];
|
||||
float64x2_t exp_poly[3];
|
||||
float64x2_t ln2_hi, ln2_lo;
|
||||
float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n;
|
||||
uint64x2_t inf;
|
||||
float64x2_t small_powx;
|
||||
uint64x2_t offset, mask;
|
||||
uint64x2_t mask_sub_0, mask_sub_1;
|
||||
float64x2_t log_c0, log_c2, log_c4, log_c5;
|
||||
double log_c1, log_c3;
|
||||
double ln2_lo, ln2_hi;
|
||||
uint64x2_t small_exp, thres_exp;
|
||||
double ln2_lo_n, ln2_hi_n;
|
||||
double inv_ln2_n, exp_c2;
|
||||
float64x2_t exp_c0, exp_c1;
|
||||
} data = {
|
||||
/* Power threshold. */
|
||||
.inf = V2 (0x7ff0000000000000),
|
||||
.small_powx = V2 (0x1p-126),
|
||||
.offset = V2 (Off),
|
||||
.mask = V2 (0xfffULL << 52),
|
||||
.mask_sub_0 = V2 (1ULL << 52),
|
||||
.mask_sub_1 = V2 (52ULL << 52),
|
||||
/* Coefficients copied from v_pow_log_data.c
|
||||
relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
|
||||
Coefficients are scaled to match the scaling during evaluation. */
|
||||
.log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2),
|
||||
V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4),
|
||||
V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8),
|
||||
V2 (-0x1.0002b8b263fc3p-3 * -8) },
|
||||
.ln2_hi = V2 (0x1.62e42fefa3800p-1),
|
||||
.ln2_lo = V2 (0x1.ef35793c76730p-45),
|
||||
.log_c0 = V2 (0x1.555555555556p-2 * -2),
|
||||
.log_c1 = -0x1.0000000000006p-2 * -2,
|
||||
.log_c2 = V2 (0x1.999999959554ep-3 * 4),
|
||||
.log_c3 = -0x1.555555529a47ap-3 * 4,
|
||||
.log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
|
||||
.log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
|
||||
.ln2_hi = 0x1.62e42fefa3800p-1,
|
||||
.ln2_lo = 0x1.ef35793c76730p-45,
|
||||
/* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
|
||||
(0.550 without fma) if |x| < ln2/512. */
|
||||
.exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
|
||||
V2 (0x1.5555576a5adcep-5) },
|
||||
.shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
|
||||
.inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
|
||||
.ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
|
||||
.ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
|
||||
.exp_c0 = V2 (0x1.fffffffffffd4p-2),
|
||||
.exp_c1 = V2 (0x1.5555571d6ef9p-3),
|
||||
.exp_c2 = 0x1.5555576a5adcep-5,
|
||||
.small_exp = V2 (0x3c90000000000000),
|
||||
.thres_exp = V2 (0x03f0000000000000),
|
||||
.inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
|
||||
.ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
|
||||
.ln2_lo_n = -0x1.c610ca86c3899p-45,
|
||||
};
|
||||
|
||||
#define A(i) data.log_poly[i]
|
||||
#define C(i) data.exp_poly[i]
|
||||
|
||||
/* This version implements an algorithm close to AOR scalar pow but
|
||||
/* This version implements an algorithm close to scalar pow but
|
||||
- does not implement the trick in the exp's specialcase subroutine to avoid
|
||||
double-rounding,
|
||||
- does not use a tail in the exponential core computation,
|
||||
|
|
@ -78,10 +90,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
|||
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
|
||||
int64x2_t k
|
||||
= vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
|
||||
uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
|
||||
uint64x2_t tmp = vsubq_u64 (ix, d->offset);
|
||||
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
|
||||
uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
|
||||
|
|
@ -92,12 +103,13 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
|||
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
|
||||
/* k*Ln2 + log(c) + r. */
|
||||
float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
|
||||
float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
|
||||
float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
|
||||
float64x2_t t2 = vaddq_f64 (t1, r);
|
||||
float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
|
||||
float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
|
||||
float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
|
||||
/* Evaluation is optimized assuming superscalar pipelined execution. */
|
||||
float64x2_t ar = vmulq_f64 (A (0), r);
|
||||
float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
|
||||
float64x2_t ar2 = vmulq_f64 (r, ar);
|
||||
float64x2_t ar3 = vmulq_f64 (r, ar2);
|
||||
/* k*Ln2 + log(c) + r + A[0]*r*r. */
|
||||
|
|
@ -105,9 +117,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
|||
float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
|
||||
float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
|
||||
/* p = log1p(r) - r - A[0]*r*r. */
|
||||
float64x2_t a56 = vfmaq_f64 (A (5), r, A (6));
|
||||
float64x2_t a34 = vfmaq_f64 (A (3), r, A (4));
|
||||
float64x2_t a12 = vfmaq_f64 (A (1), r, A (2));
|
||||
float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
|
||||
float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
|
||||
float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
|
||||
float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
|
||||
float64x2_t p = vfmaq_f64 (a34, ar2, a56);
|
||||
p = vfmaq_f64 (a12, ar2, p);
|
||||
p = vmulq_f64 (ar3, p);
|
||||
|
|
@ -118,29 +131,37 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
|||
return y;
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
exp_special_case (float64x2_t x, float64x2_t xtail)
|
||||
{
|
||||
return (float64x2_t){ exp_nosignbias (x[0], xtail[0]),
|
||||
exp_nosignbias (x[1], xtail[1]) };
|
||||
}
|
||||
|
||||
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
|
||||
static inline float64x2_t
|
||||
v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
|
||||
v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
|
||||
{
|
||||
/* Fallback to scalar exp_inline for all lanes if any lane
|
||||
contains value of x s.t. |x| <= 2^-54 or >= 512. */
|
||||
uint64x2_t abstop
|
||||
= vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff));
|
||||
uint64x2_t uoflowx
|
||||
= vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
|
||||
uint64x2_t uoflowx = vcgeq_u64 (
|
||||
vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
|
||||
d->thres_exp);
|
||||
if (unlikely (v_any_u64 (uoflowx)))
|
||||
return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1));
|
||||
return exp_special_case (x, vnegq_f64 (neg_xtail));
|
||||
|
||||
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
|
||||
/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
|
||||
float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
|
||||
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
|
||||
float64x2_t kd = vaddq_f64 (z, d->shift);
|
||||
uint64x2_t ki = vreinterpretq_u64_f64 (kd);
|
||||
kd = vsubq_f64 (kd, d->shift);
|
||||
float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
|
||||
r = vfmsq_f64 (r, kd, d->ln2_lo_n);
|
||||
float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
|
||||
float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
|
||||
float64x2_t kd = vrndnq_f64 (z);
|
||||
uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
|
||||
float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
|
||||
float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
|
||||
r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
|
||||
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
|
||||
r = vaddq_f64 (r, xtail);
|
||||
r = vsubq_f64 (r, neg_xtail);
|
||||
/* 2^(k/N) ~= scale. */
|
||||
uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
|
||||
uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
|
||||
|
|
@ -149,8 +170,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
|
|||
sbits = vaddq_u64 (sbits, top);
|
||||
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
|
||||
tmp = vfmaq_f64 (C (0), r, tmp);
|
||||
float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
|
||||
tmp = vfmaq_f64 (d->exp_c0, r, tmp);
|
||||
tmp = vfmaq_f64 (r, r2, tmp);
|
||||
float64x2_t scale = vreinterpretq_f64_u64 (sbits);
|
||||
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
|
||||
|
|
@ -158,54 +179,59 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
|
|||
return vfmaq_f64 (scale, scale, tmp);
|
||||
}
|
||||
|
||||
static float64x2_t NOINLINE VPCS_ATTR
|
||||
scalar_fallback (float64x2_t x, float64x2_t y)
|
||||
{
|
||||
return (float64x2_t){ pow_scalar_special_case (x[0], y[0]),
|
||||
pow_scalar_special_case (x[1], y[1]) };
|
||||
}
|
||||
|
||||
float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
/* Case of x <= 0 is too complicated to be vectorised efficiently here,
|
||||
fallback to scalar pow for all lanes if any x < 0 detected. */
|
||||
if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
|
||||
return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
|
||||
return scalar_fallback (x, y);
|
||||
|
||||
uint64x2_t vix = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t viy = vreinterpretq_u64_f64 (y);
|
||||
uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
|
||||
uint64x2_t vtopy = vshrq_n_u64 (viy, 52);
|
||||
uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff));
|
||||
uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff));
|
||||
uint64x2_t iay = vandq_u64 (viy, d->inf);
|
||||
|
||||
/* Special cases of x or y. */
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Small or large. */
|
||||
uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
|
||||
uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
|
||||
uint64x2_t specialx
|
||||
= vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
|
||||
uint64x2_t specialy
|
||||
= vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
|
||||
#else
|
||||
/* Inf or nan. */
|
||||
uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff));
|
||||
uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff));
|
||||
/* The case y==0 does not trigger a special case, since in this case it is
|
||||
necessary to fix the result only if x is a signalling nan, which already
|
||||
triggers a special case. We test y==0 directly in the scalar fallback. */
|
||||
uint64x2_t iax = vandq_u64 (vix, d->inf);
|
||||
uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
|
||||
uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
|
||||
#endif
|
||||
uint64x2_t special = vorrq_u64 (specialx, specialy);
|
||||
/* Fallback to scalar on all lanes if any lane is inf or nan. */
|
||||
if (unlikely (v_any_u64 (special)))
|
||||
return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
|
||||
return scalar_fallback (x, y);
|
||||
|
||||
/* Small cases of x: |x| < 0x1p-126. */
|
||||
uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX);
|
||||
uint64x2_t smallx = vcaltq_f64 (x, d->small_powx);
|
||||
if (unlikely (v_any_u64 (smallx)))
|
||||
{
|
||||
/* Update ix if top 12 bits of x are 0. */
|
||||
uint64x2_t sub_x = vceqzq_u64 (vtopx);
|
||||
uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52));
|
||||
if (unlikely (v_any_u64 (sub_x)))
|
||||
{
|
||||
/* Normalize subnormal x so exponent becomes negative. */
|
||||
uint64x2_t vix_norm
|
||||
= vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52)));
|
||||
vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff));
|
||||
vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
|
||||
uint64x2_t vix_norm = vreinterpretq_u64_f64 (
|
||||
vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
|
||||
vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
|
||||
vix = vbslq_u64 (sub_x, vix_norm, vix);
|
||||
}
|
||||
}
|
||||
|
|
@ -216,21 +242,20 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
|
|||
|
||||
/* Vector Exp(y_loghi, y_loglo). */
|
||||
float64x2_t vehi = vmulq_f64 (y, vhi);
|
||||
float64x2_t velo = vmulq_f64 (y, vlo);
|
||||
float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
|
||||
velo = vsubq_f64 (velo, vemi);
|
||||
return v_exp_inline (vehi, velo, d);
|
||||
float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
|
||||
return v_exp_inline (vehi, neg_velo, d);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 2, pow)
|
||||
PL_TEST_ULP (V_NAME_D2 (pow), 0.55)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
|
||||
TEST_SIG (V, D, 2, pow)
|
||||
TEST_ULP (V_NAME_D2 (pow), 0.55)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
|
||||
/* Wide intervals spanning the whole domain but shared between x and y. */
|
||||
#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
|
||||
#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
|
||||
#define EXPAND(str) str##000000000
|
||||
#define SHL52(str) EXPAND (str)
|
||||
V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
|
||||
|
|
@ -248,12 +273,12 @@ V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
|
|||
V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
|
||||
V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
|
||||
/* x is negative, y is odd or even integer, or y is real not integer. */
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
|
||||
/* 1.0^y. */
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
|
||||
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
|
||||
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
|
||||
209
contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
Normal file
209
contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
/*
|
||||
* Single-precision vector powf function.
|
||||
*
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
/* Bit patterns used to classify x in the entry point: lanes for which
   (asuint (x) - Min) >= Thresh, compared as unsigned, are flagged for the
   scalar fallback.  */
#define Min v_u32 (0x00800000)
|
||||
#define Max v_u32 (0x7f800000)
|
||||
#define Thresh v_u32 (0x7f000000) /* Max - Min. */
|
||||
/* Low 23 bits of a 32-bit word; cleared from (asuint (x) - Off) to obtain
   `top` in the entry point.  */
#define MantissaMask v_u32 (0x007fffff)
|
||||
|
||||
/* Shorthand for the two coefficient arrays; both expect a local `d` that
   points at the data table.  */
#define A d->log2_poly
|
||||
#define C d->exp2f_poly
|
||||
|
||||
/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
|
||||
/* Offset subtracted from asuint (x) before it is split into the exponent
   part and the log2 table index.  */
#define Off v_u32 (0x3f35d000)
|
||||
|
||||
/* Both lookup tables have 1 << 5 = 32 entries.  */
#define V_POWF_LOG2_TABLE_BITS 5
|
||||
#define V_EXP2F_TABLE_BITS 5
|
||||
/* Mask that keeps a log2 table index within the table.  */
#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
|
||||
/* 32.0: multiplied into the logc entries and log2 coefficients, and divided
   out of the exp2f coefficients, in the data table.  */
#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
|
||||
|
||||
/* Constant table for the vector powf kernel: a log2 lookup table and
   polynomial for the log step, and an exp2 lookup table and polynomial for
   the exp step.  */
static const struct data
|
||||
{
|
||||
struct
|
||||
{
|
||||
/* Adjacent pair loaded together by log2_lookup.  ylogx_core forms
   r = iz * invc - 1 and adds logc to k; logc is stored multiplied by
   Scale.  */
double invc, logc;
|
||||
} log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
|
||||
/* Coefficients A[0..3] used by ylogx_core, each multiplied by Scale.  */
float64x2_t log2_poly[4];
|
||||
/* 64-bit patterns; powf_core adds ki << (52 - V_EXP2F_TABLE_BITS) to the
   selected entry and reinterprets the sum as a double.  */
uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
|
||||
/* Coefficients C[0..2] used by powf_core, divided by Scale^3, Scale^2 and
   Scale respectively.  */
float64x2_t exp2f_poly[3];
|
||||
} data = {
|
||||
.log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
|
||||
{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
|
||||
{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
|
||||
{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
|
||||
{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
|
||||
{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
|
||||
{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
|
||||
{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
|
||||
{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
|
||||
{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
|
||||
{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
|
||||
{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
|
||||
{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
|
||||
{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
|
||||
{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
|
||||
{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
|
||||
{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
|
||||
{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
|
||||
{0x1p+0, 0x0p+0 * Scale},
|
||||
{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
|
||||
{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
|
||||
{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
|
||||
{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
|
||||
{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
|
||||
{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
|
||||
{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
|
||||
{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
|
||||
{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
|
||||
{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
|
||||
{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
|
||||
{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
|
||||
{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
|
||||
.log2_poly = { /* rel err: 1.5 * 2^-30. */
|
||||
V2 (-0x1.6ff5daa3b3d7cp-2 * Scale),
|
||||
V2 (0x1.ec81d03c01aebp-2 * Scale),
|
||||
V2 (-0x1.71547bb43f101p-1 * Scale),
|
||||
V2 (0x1.7154764a815cbp0 * Scale)},
|
||||
.exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
|
||||
0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
|
||||
0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
|
||||
0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
|
||||
0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
|
||||
0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
|
||||
0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
|
||||
0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
|
||||
0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
|
||||
0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
|
||||
0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
|
||||
.exp2f_poly = { /* rel err: 1.69 * 2^-34. */
|
||||
V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale),
|
||||
V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale),
|
||||
V2 (0x1.62e42ff0c52d6p-1 / Scale)}};
|
||||
|
||||
/* Out-of-line slow path: hands x, y, the vector result `ret` and the lane
   mask `cmp` to v_call2_f32 together with the scalar powf.
   NOTE(review): v_call2_f32 is defined outside this file; presumably it
   replaces the lanes selected by cmp with scalar powf results and keeps ret
   elsewhere - confirm against v_math.h.  */
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
|
||||
{
|
||||
return v_call2_f32 (powf, x, y, ret, cmp);
|
||||
}
|
||||
|
||||
/* Computes y * log2(x) (scaled by Scale, since logc and the coefficients in
   A carry that factor) for two lanes in double precision.
   d     data table supplying the A coefficients.
   iz    reduced input, already widened to double.
   k     exponent contribution as a double.
   invc  table value multiplied into iz.
   logc  table value added to k.
   y     exponent argument, already widened to double.
   Returns the product of the log approximation and y.  */
static inline float64x2_t
|
||||
ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k,
|
||||
float64x2_t invc, float64x2_t logc, float64x2_t y)
|
||||
{
|
||||
|
||||
/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
|
||||
/* r = iz * invc - 1, computed with a single fused multiply-add.  */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc);
|
||||
float64x2_t y0 = vaddq_f64 (logc, k);
|
||||
|
||||
/* Polynomial to approximate log1p(r)/ln2. */
|
||||
/* Horner evaluation: ((A0*r + A1)*r + A2)*r + A3, then times r plus y0.  */
float64x2_t logx = vfmaq_f64 (A[1], r, A[0]);
|
||||
logx = vfmaq_f64 (A[2], logx, r);
|
||||
logx = vfmaq_f64 (A[3], logx, r);
|
||||
logx = vfmaq_f64 (y0, logx, r);
|
||||
|
||||
return vmulq_f64 (logx, y);
|
||||
}
|
||||
|
||||
/* Loads one log2_tab entry as a two-lane vector: lane 0 is invc and lane 1 is
   logc, relying on the two doubles being adjacent in the table struct.
   The index is taken from the V_POWF_LOG2_TABLE_BITS bits of i directly
   below bit 23; callers pass asuint (x) - Off.  */
static inline float64x2_t
|
||||
log2_lookup (const struct data *d, uint32_t i)
|
||||
{
|
||||
return vld1q_f64 (
|
||||
&d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc);
|
||||
}
|
||||
|
||||
/* Loads exp2f_tab[i mod table size] as a one-lane vector.  i is unsigned, so
   a negative signed lane passed by the caller is converted first and the
   modulo keeps its low V_EXP2F_TABLE_BITS bits.  */
static inline uint64x1_t
|
||||
exp2f_lookup (const struct data *d, uint64_t i)
|
||||
{
|
||||
return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]);
|
||||
}
|
||||
|
||||
/* Computes exp2 (ylogx / N) for two lanes, where N = 1 << V_EXP2F_TABLE_BITS
   and ylogx is the scaled product returned by ylogx_core.
   d      data table supplying exp2f_tab and the C coefficients.
   ylogx  scaled y * log2(x) in double precision.
   Returns the two results narrowed to single precision.  */
static inline float32x2_t
|
||||
powf_core (const struct data *d, float64x2_t ylogx)
|
||||
{
|
||||
/* N*x = k + r with r in [-1/2, 1/2]. */
|
||||
float64x2_t kd = vrndnq_f64 (ylogx);
|
||||
/* ki must be the same integer as kd.  vrndnq rounds ties to even, so the
   integer conversion has to round ties to even as well: a ties-away
   conversion disagrees with kd whenever ylogx is exactly half-way above an
   even integer (e.g. x = 2, y = 1/64), which scales the result by
   2^(1/N) or 2^(-1/N).  */
int64x2_t ki = vcvtnq_s64_f64 (ylogx);
|
||||
float64x2_t r = vsubq_f64 (ylogx, kd);
|
||||
|
||||
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
|
||||
/* Table entry selected by ki mod N for each lane.  */
uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)),
|
||||
exp2f_lookup (d, vgetq_lane_s64 (ki, 1)));
|
||||
/* Add ki, shifted so that ki / N lands in the exponent field, to the table
   bits.  */
t = vaddq_u64 (
|
||||
t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS)));
|
||||
float64x2_t s = vreinterpretq_f64_u64 (t);
|
||||
float64x2_t p = vfmaq_f64 (C[1], r, C[0]);
|
||||
p = vfmaq_f64 (C[2], r, p);
|
||||
/* s + p * (s * r) = s * (1 + r * p).  */
p = vfmaq_f64 (s, p, vmulq_f64 (s, r));
|
||||
return vcvt_f32_f64 (p);
|
||||
}
|
||||
|
||||
/* Vector single-precision pow for four lanes.  Each lane is evaluated in
   double precision: the inputs are split into low and high pairs, the log
   and exp steps are run on each pair, and the results are recombined.
   Lanes flagged in cmp are passed to special_case together with the
   vector result.  */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint32x4_t u = vreinterpretq_u32_f32 (x);
|
||||
/* Unsigned range check on the bits of x: true when asuint (x) - Min wraps or
   reaches Thresh, i.e. the bits lie outside [Min, Max).  */
uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
|
||||
uint32x4_t tmp = vsubq_u32 (u, Off);
|
||||
/* top keeps everything above the mantissa field of tmp; subtracting it from
   u gives the bits of the reduced input iz.  */
uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
|
||||
float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top));
|
||||
int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
|
||||
23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
|
||||
|
||||
/* Use double precision for each lane: split input vectors into lo and hi
|
||||
halves and promote. */
|
||||
/* One {invc, logc} pair per lane, indexed from tmp.  */
float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)),
|
||||
tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)),
|
||||
tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)),
|
||||
tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3));
|
||||
|
||||
float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)),
|
||||
iz_hi = vcvt_high_f64_f32 (iz);
|
||||
|
||||
float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))),
|
||||
k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k));
|
||||
|
||||
/* zip1 collects lane 0 (invc) of two table pairs, zip2 collects lane 1
   (logc).  */
float64x2_t invc_lo = vzip1q_f64 (tab0, tab1),
|
||||
invc_hi = vzip1q_f64 (tab2, tab3),
|
||||
logc_lo = vzip2q_f64 (tab0, tab1),
|
||||
logc_hi = vzip2q_f64 (tab2, tab3);
|
||||
|
||||
float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)),
|
||||
y_hi = vcvt_high_f64_f32 (y);
|
||||
|
||||
float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo);
|
||||
float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi);
|
||||
|
||||
/* Upper 32 bits of each of the four double results.  */
uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo),
|
||||
vreinterpretq_u32_f64 (ylogx_hi));
|
||||
|
||||
/* Bits 47..62 of each double (sign dropped) are compared with the same bits
   of 126 * N; lanes at or above that magnitude are also flagged.  */
cmp = vorrq_u32 (
|
||||
cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)),
|
||||
vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS))
|
||||
>> 47)));
|
||||
|
||||
float32x2_t p_lo = powf_core (d, ylogx_lo);
|
||||
float32x2_t p_hi = powf_core (d, ylogx_hi);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp);
|
||||
return vcombine_f32 (p_lo, p_hi);
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F2 (pow)
|
||||
|
||||
TEST_SIG (V, F, 2, pow)
|
||||
TEST_ULP (V_NAME_F2 (pow), 2.1)
|
||||
TEST_DISABLE_FENV (V_NAME_F2 (pow))
|
||||
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
|
||||
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
|
||||
|
|
@ -1,17 +1,19 @@
|
|||
/*
|
||||
* Double-precision vector sin function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[7];
|
||||
float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
|
||||
float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
|
||||
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
|
||||
|
|
@ -23,12 +25,13 @@ static const struct data
|
|||
.pi_1 = V2 (0x1.921fb54442d18p+1),
|
||||
.pi_2 = V2 (0x1.1a62633145c06p-53),
|
||||
.pi_3 = V2 (0x1.c1cd129024e09p-106),
|
||||
.shift = V2 (0x1.8p52),
|
||||
};
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
|
||||
# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
|
||||
/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
|
||||
# define TinyBound v_u64 (0x3020000000000000)
|
||||
/* RangeVal - TinyBound. */
|
||||
# define Thresh v_u64 (0x1160000000000000)
|
||||
#endif
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
|
@ -61,16 +64,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
|
|||
fenv). These lanes will be fixed by special-case handler later. */
|
||||
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
|
||||
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
|
||||
r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
|
||||
r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
|
||||
#else
|
||||
r = x;
|
||||
cmp = vcageq_f64 (x, d->range_val);
|
||||
#endif
|
||||
|
||||
/* n = rint(|x|/pi). */
|
||||
n = vfmaq_f64 (d->shift, d->inv_pi, r);
|
||||
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
|
||||
n = vsubq_f64 (n, d->shift);
|
||||
n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
|
||||
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f64 (r, d->pi_1, n);
|
||||
|
|
@ -95,3 +97,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
|
|||
return special_case (x, y, odd, cmp);
|
||||
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, sin, -3.1, 3.1)
|
||||
TEST_ULP (V_NAME_D1 (sin), 3.0)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000)
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Double-precision vector sincos function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
|
|
@ -10,12 +10,21 @@
|
|||
be linked against the scalar sincosf from math/. */
|
||||
#define _GNU_SOURCE
|
||||
#include <math.h>
|
||||
#undef _GNU_SOURCE
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_sincos_common.h"
|
||||
|
||||
/* sincos not available for all scalar libm implementations. */
|
||||
#if defined(_MSC_VER) || !defined(__GLIBC__)
|
||||
/* Local stand-in for sincos, compiled only under the enclosing #if (MSVC or
   a non-glibc libm).  Stores sin (x) through out_sin and then cos (x) through
   out_cos, using two separate libm calls.  */
static void
|
||||
sincos (double x, double *out_sin, double *out_cos)
|
||||
{
|
||||
*out_sin = sin (x);
|
||||
*out_cos = cos (x);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, uint64x2_t special, double *out_sin,
|
||||
double *out_cos)
|
||||
|
|
@ -46,12 +55,13 @@ _ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
|
|||
special_case (x, special, out_sin, out_cos);
|
||||
}
|
||||
|
||||
PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
|
||||
PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
|
||||
TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos)
|
||||
TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin)
|
||||
TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
|
||||
TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
|
||||
#define V_SINCOS_INTERVAL(lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
|
||||
V_SINCOS_INTERVAL (0, 0x1p23, 500000)
|
||||
V_SINCOS_INTERVAL (-0, -0x1p23, 500000)
|
||||
TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
|
||||
TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
|
||||
V_SINCOS_INTERVAL (0, 0x1p-31, 50000)
|
||||
V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000)
|
||||
V_SINCOS_INTERVAL (0x1p23, inf, 10000)
|
||||
V_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Single-precision vector sincos function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
|
|
@ -10,11 +10,20 @@
|
|||
be linked against the scalar sincosf from math/. */
|
||||
#define _GNU_SOURCE
|
||||
#include <math.h>
|
||||
#undef _GNU_SOURCE
|
||||
|
||||
#include "v_sincosf_common.h"
|
||||
#include "v_math.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* sincos not available for all scalar libm implementations. */
|
||||
#if defined(_MSC_VER) || !defined(__GLIBC__)
|
||||
/* Local stand-in for sincosf, compiled only under the enclosing #if (MSVC or
   a non-glibc libm).  Stores sinf (x) through out_sin and then cosf (x)
   through out_cos, using two separate libm calls.  */
static void
|
||||
sincosf (float x, float *out_sin, float *out_cos)
|
||||
{
|
||||
*out_sin = sinf (x);
|
||||
*out_cos = cosf (x);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, uint32x4_t special, float *out_sin,
|
||||
|
|
@ -47,12 +56,13 @@ _ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
|
|||
special_case (x, special, out_sin, out_cos);
|
||||
}
|
||||
|
||||
PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
|
||||
PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos)
|
||||
TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
|
||||
TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
|
||||
#define V_SINCOSF_INTERVAL(lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
|
||||
PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
|
||||
V_SINCOSF_INTERVAL (0, 0x1p20, 500000)
|
||||
V_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
|
||||
TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
|
||||
TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
|
||||
V_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
|
||||
V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
|
||||
V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
|
||||
V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Double-precision vector sincospi function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "v_sincospi_common.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Double-precision vector function allowing calculation of both sin and cos in
|
||||
one function call, using separate argument reduction and shared low-order
|
||||
polynomials.
|
||||
Approximation for vector double-precision sincospi(x).
|
||||
Maximum Error 3.09 ULP:
|
||||
_ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
|
||||
want 0x1.fd54d0b327cf4p-1
|
||||
Maximum Error 3.16 ULP:
|
||||
_ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
|
||||
want 0x1.fd2da484ff402p-1. */
|
||||
VPCS_ATTR void
|
||||
_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos)
|
||||
{
|
||||
const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
|
||||
|
||||
/* The shared helper returns both results: val[0] is stored as the sine and
   val[1] as the cosine below.  */
float64x2x2_t sc = v_sincospi_inline (x, d);
|
||||
|
||||
/* Each store writes a full two-lane vector, so out_sin and out_cos must each
   have room for two doubles.  */
vst1q_f64 (out_sin, sc.val[0]);
|
||||
vst1q_f64 (out_cos, sc.val[1]);
|
||||
}
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos)
|
||||
TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin)
|
||||
TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59)
|
||||
TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66)
|
||||
# define V_SINCOSPI_INTERVAL(lo, hi, n) \
|
||||
TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \
|
||||
TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n)
|
||||
V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
|
||||
V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
|
||||
V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000)
|
||||
V_SINCOSPI_INTERVAL (0x1p63, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
/*
|
||||
* Single-precision vector sincospi function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_sincospif_common.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "mathlib.h"
|
||||
|
||||
/* Single-precision vector function allowing calculation of both sinpi and
|
||||
cospi in one function call, using shared argument reduction and polynomials.
|
||||
Worst-case error for sin is 3.04 ULP:
|
||||
_ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
|
||||
Worst-case error for cos is 3.18 ULP:
|
||||
_ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
|
||||
*/
|
||||
VPCS_ATTR void
|
||||
_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos)
|
||||
{
|
||||
const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
|
||||
|
||||
/* The shared helper returns both results: val[0] is stored as the sine and
   val[1] as the cosine below.  */
float32x4x2_t sc = v_sincospif_inline (x, d);
|
||||
|
||||
/* Each store writes a full four-lane vector, so out_sin and out_cos must each
   have room for four floats.  */
vst1q_f32 (out_sin, sc.val[0]);
|
||||
vst1q_f32 (out_cos, sc.val[1]);
|
||||
}
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin)
|
||||
TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos)
|
||||
TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54)
|
||||
TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68)
|
||||
# define V_SINCOSPIF_INTERVAL(lo, hi, n) \
|
||||
TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \
|
||||
TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n)
|
||||
V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000)
|
||||
V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000)
|
||||
V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
|
||||
V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,17 +1,19 @@
|
|||
/*
|
||||
* Single-precision vector sin function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "test_defs.h"
|
||||
#include "test_sig.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[4];
|
||||
float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
|
||||
float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* 1.886 ulp error. */
|
||||
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
|
||||
|
|
@ -22,13 +24,14 @@ static const struct data
|
|||
.pi_3 = V4 (-0x1.ee59dap-49f),
|
||||
|
||||
.inv_pi = V4 (0x1.45f306p-2f),
|
||||
.shift = V4 (0x1.8p+23f),
|
||||
.range_val = V4 (0x1p20f)
|
||||
};
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
|
||||
# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
|
||||
/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
|
||||
# define TinyBound v_u32 (0x22000000)
|
||||
/* RangeVal - TinyBound. */
|
||||
# define Thresh v_u32 (0x27800000)
|
||||
#endif
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
|
@ -41,7 +44,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
|||
return v_call_f32 (sinf, x, y, cmp);
|
||||
}
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, r, r2, y;
|
||||
|
|
@ -53,23 +56,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
|
|||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 0 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
special-case handler later. */
|
||||
r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
|
||||
r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
|
||||
#else
|
||||
r = x;
|
||||
cmp = vcageq_f32 (x, d->range_val);
|
||||
#endif
|
||||
|
||||
/* n = rint(|x|/pi) */
|
||||
n = vfmaq_f32 (d->shift, d->inv_pi, r);
|
||||
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
|
||||
n = vsubq_f32 (n, d->shift);
|
||||
/* n = rint(|x|/pi). */
|
||||
n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
|
||||
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f32 (r, d->pi_1, n);
|
||||
r = vfmsq_f32 (r, d->pi_2, n);
|
||||
r = vfmsq_f32 (r, d->pi_3, n);
|
||||
|
||||
/* y = sin(r) */
|
||||
/* y = sin(r). */
|
||||
r2 = vmulq_f32 (r, r);
|
||||
y = vfmaq_f32 (C (2), C (3), r2);
|
||||
y = vfmaq_f32 (C (1), y, r2);
|
||||
|
|
@ -80,3 +82,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
|
|||
return special_case (x, y, odd, cmp);
|
||||
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (sin)
|
||||
|
||||
TEST_SIG (V, F, 1, sin, -3.1, 3.1)
|
||||
TEST_ULP (V_NAME_F1 (sin), 1.4)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000)
|
||||
80
contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
Normal file
80
contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Double-precision vector sinh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_expm1_inline.h"
|
||||
|
||||
/* Constants for vector double-precision sinh: the expm1 helper's data, the
   bit pattern of 0.5, and the bounds used to detect special lanes.  Which
   bounds exist depends on WANT_SIMD_EXCEPT.  */
static const struct data
|
||||
{
|
||||
struct v_expm1_data d;
|
||||
/* Bit pattern of 0.5; combined with the sign bit of x to form halfsign.  */
uint64x2_t halff;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Used as one unsigned range check: asuint (|x|) - tiny_bound >= thresh.  */
uint64x2_t tiny_bound, thresh;
|
||||
#else
|
||||
/* Compared against |x| with an absolute greater-or-equal.  */
float64x2_t large_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.d = V_EXPM1_DATA,
|
||||
.halff = V2 (0x3fe0000000000000),
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* 2^-26, below which sinh(x) rounds to x. */
|
||||
.tiny_bound = V2 (0x3e50000000000000),
|
||||
/* asuint(large_bound) - asuint(tiny_bound). */
|
||||
.thresh = V2 (0x0230000000000000),
|
||||
#else
|
||||
/* 2^9. expm1 helper overflows for large input. */
|
||||
.large_bound = V2 (0x1p+9),
|
||||
#endif
|
||||
};
|
||||
|
||||
/* Out-of-line slow path: calls v_call_f64 with the scalar sinh, x as both the
   input and the fallback vector, and an all-ones mask.
   NOTE(review): v_call_f64 is defined outside this file; with an all-ones
   mask it presumably evaluates scalar sinh for every lane - confirm against
   v_math.h.  */
static float64x2_t NOINLINE VPCS_ATTR
|
||||
special_case (float64x2_t x)
|
||||
{
|
||||
return v_call_f64 (sinh, x, x, v_u64 (-1));
|
||||
}
|
||||
|
||||
/* Approximation for vector double-precision sinh(x) using expm1.
|
||||
sinh(x) = (exp(x) - exp(-x)) / 2.
|
||||
The greatest observed error is 2.52 ULP:
|
||||
_ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
|
||||
want -0x1.ac2f05bb66fc9p-2. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
/* Bit-select: the sign bit comes from x and every other bit from halff, so
   halfsign is 0.5 carrying the sign of x.  */
float64x2_t halfsign = vreinterpretq_f64_u64 (
|
||||
vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* One unsigned comparison flags a lane when asuint (|x|) is below tiny_bound
   (the subtraction wraps) or at least tiny_bound + thresh.  */
uint64x2_t special = vcgeq_u64 (
|
||||
vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
|
||||
#else
|
||||
/* Flags lanes with |x| >= large_bound.  */
uint64x2_t special = vcageq_f64 (x, d->large_bound);
|
||||
#endif
|
||||
|
||||
/* Fall back to scalar variant for all lanes if any of them are special. */
|
||||
if (unlikely (v_any_u64 (special)))
|
||||
return special_case (x);
|
||||
|
||||
/* Up to the point that expm1 overflows, we can use it to calculate sinh
|
||||
using a slight rearrangement of the definition of sinh. This allows us to
|
||||
retain acceptable accuracy for very small inputs. */
|
||||
float64x2_t t = expm1_inline (ax, &d->d);
|
||||
/* With t = expm1 (|x|): t + t / (t + 1) = exp (|x|) - exp (-|x|).  */
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
|
||||
/* Multiplying by halfsign halves the value and restores the sign of x.  */
return vmulq_f64 (t, halfsign);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, sinh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (sinh), 2.02)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
|
||||
|
|
@ -1,28 +1,25 @@
|
|||
/*
|
||||
* Single-precision vector sinh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_expm1f_inline.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
struct v_expm1f_data expm1f_consts;
|
||||
uint32x4_t halff;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t tiny_bound, thresh;
|
||||
#else
|
||||
uint32x4_t oflow_bound;
|
||||
float32x4_t oflow_bound;
|
||||
#endif
|
||||
} data = {
|
||||
.expm1f_consts = V_EXPM1F_DATA,
|
||||
.halff = V4 (0x3f000000),
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* 0x1.6a09e8p-32, below which expm1f underflows. */
|
||||
.tiny_bound = V4 (0x2fb504f4),
|
||||
|
|
@ -30,14 +27,15 @@ static const struct data
|
|||
.thresh = V4 (0x12fbbbb3),
|
||||
#else
|
||||
/* 0x1.61814ep+6, above which expm1f helper overflows. */
|
||||
.oflow_bound = V4 (0x42b0c0a7),
|
||||
.oflow_bound = V4 (0x1.61814ep+6),
|
||||
#endif
|
||||
};
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
|
||||
uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (sinhf, x, y, special);
|
||||
return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
|
||||
}
|
||||
|
||||
/* Approximation for vector single-precision sinh(x) using expm1.
|
||||
|
|
@ -45,21 +43,21 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|||
The maximum error is 2.26 ULP:
|
||||
_ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
|
||||
want 0x1.e469e4p-4. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t sign = veorq_u32 (ix, iax);
|
||||
float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
|
||||
float32x4_t halfsign = vreinterpretq_f32_u32 (
|
||||
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
|
||||
uint32x4_t special = vcgeq_u32 (
|
||||
vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
|
||||
ax = v_zerofy_f32 (ax, special);
|
||||
#else
|
||||
uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
|
||||
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
|
||||
#endif
|
||||
|
||||
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
|
||||
|
|
@ -71,14 +69,16 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
|
|||
/* Fall back to the scalar variant for any lanes that should trigger an
|
||||
exception. */
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return special_case (x, vmulq_f32 (t, halfsign), special);
|
||||
return special_case (x, t, halfsign, special);
|
||||
|
||||
return vmulq_f32 (t, halfsign);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, sinh, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (sinh), 1.76)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
|
||||
HALF_WIDTH_ALIAS_F1 (sinh)
|
||||
|
||||
TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (sinh), 1.76)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
|
||||
|
|
@ -1,15 +1,15 @@
|
|||
/*
|
||||
* Double-precision vector sinpi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -34,7 +34,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
|
|||
{
|
||||
/* Fall back to scalar code. */
|
||||
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
return v_call_f64 (sinpi, x, y, cmp);
|
||||
return v_call_f64 (arm_math_sinpi, x, y, cmp);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
|
|||
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
|
||||
PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_ULP (V_NAME_D1 (sinpi), 2.56)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,15 +1,15 @@
|
|||
/*
|
||||
* Single-precision vector sinpi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
|
@ -29,7 +29,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
|||
{
|
||||
/* Fall back to scalar code. */
|
||||
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
return v_call_f32 (sinpif, x, y, cmp);
|
||||
return v_call_f32 (arm_math_sinpif, x, y, cmp);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
|||
Maximum Error 3.03 ULP:
|
||||
_ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
|
||||
want 0x1.f7cd5p-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -72,10 +72,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
|
|||
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, sinpi, -0.9, 0.9)
|
||||
PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
|
||||
HALF_WIDTH_ALIAS_F1 (sinpi)
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_ULP (V_NAME_F1 (sinpi), 2.54)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,19 +1,20 @@
|
|||
/*
|
||||
* Double-precision vector tan(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[9];
|
||||
float64x2_t half_pi, two_over_pi, shift;
|
||||
double half_pi[2];
|
||||
float64x2_t two_over_pi, shift;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float64x2_t range_val;
|
||||
#endif
|
||||
|
|
@ -71,8 +72,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
|
|||
/* Use q to reduce x to r in [-pi/4, pi/4], by:
|
||||
r = x - q * pi/2, in extended precision. */
|
||||
float64x2_t r = x;
|
||||
r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
|
||||
r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
|
||||
float64x2_t half_pi = vld1q_f64 (dat->half_pi);
|
||||
r = vfmsq_laneq_f64 (r, q, half_pi, 0);
|
||||
r = vfmsq_laneq_f64 (r, q, half_pi, 1);
|
||||
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
|
||||
formula. */
|
||||
r = vmulq_n_f64 (r, 0.5);
|
||||
|
|
@ -112,9 +114,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
|
|||
vbslq_f64 (no_recip, d, n));
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, tan, -3.1, 3.1)
|
||||
PL_TEST_ULP (V_NAME_D1 (tan), 2.99)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
|
||||
TEST_SIG (V, D, 1, tan, -3.1, 3.1)
|
||||
TEST_ULP (V_NAME_D1 (tan), 2.99)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
|
||||
|
|
@ -1,19 +1,19 @@
|
|||
/*
|
||||
* Single-precision vector tan(x) function.
|
||||
*
|
||||
* Copyright (c) 2021-2023, Arm Limited.
|
||||
* Copyright (c) 2021-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "v_poly_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[6];
|
||||
float32x4_t pi_consts;
|
||||
float pi_consts[4];
|
||||
float32x4_t shift;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t range_val;
|
||||
|
|
@ -64,7 +64,7 @@ eval_poly (float32x4_t z, const struct data *d)
|
|||
Maximum error is 3.45 ULP:
|
||||
__v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
|
||||
want 0x1.ff9850p-1. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t special_arg = x;
|
||||
|
|
@ -85,16 +85,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
|
|||
#endif
|
||||
|
||||
/* n = rint(x/(pi/2)). */
|
||||
float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
|
||||
float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
|
||||
float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
|
||||
float32x4_t n = vsubq_f32 (q, d->shift);
|
||||
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
|
||||
uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
|
||||
|
||||
/* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
|
||||
float32x4_t r;
|
||||
r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
|
||||
r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
|
||||
r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
|
||||
r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
|
||||
r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
|
||||
r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
|
||||
|
||||
/* If x lives in an interval, where |tan(x)|
|
||||
- is finite, then use a polynomial approximation of the form
|
||||
|
|
@ -119,9 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
|
|||
return vbslq_f32 (pred_alt, inv_y, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, tan, -3.1, 3.1)
|
||||
PL_TEST_ULP (V_NAME_F1 (tan), 2.96)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
|
||||
HALF_WIDTH_ALIAS_F1 (tan)
|
||||
|
||||
TEST_SIG (V, F, 1, tan, -3.1, 3.1)
|
||||
TEST_ULP (V_NAME_F1 (tan), 2.96)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
|
||||
67
contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
Normal file
67
contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Double-precision vector tanh(x) function.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_expm1_inline.h"
|
||||
|
||||
/* Constants for the vector tanh kernel: the table consumed by expm1_inline
   plus two bit-pattern bounds used to pick out lanes that must take the
   special-case path. */
static const struct data
|
||||
{
|
||||
struct v_expm1_data d;
|
||||
uint64x2_t thresh, tiny_bound;
|
||||
} data = {
|
||||
.d = V_EXPM1_DATA,
|
||||
.tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
|
||||
/* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
|
||||
/* The kernel tests asuint64 (|x|) - tiny_bound > thresh with an unsigned
   compare, so inputs below tiny_bound (where the subtraction wraps) are
   flagged as well as inputs above the upper bound. */
.thresh = V2 (0x01f241bf835f9d5f),
|
||||
};
|
||||
|
||||
/* Slow path for tanh: completes the vector quotient q / qp2 and passes it,
   together with the original input x and the lane mask, to v_call_f64 along
   with the scalar tanh routine.
   NOTE(review): v_call_f64 is defined outside this file; presumably it
   recomputes only the lanes selected by `special` with the scalar function
   and keeps the vector result elsewhere - confirm in v_math.h. */
static float64x2_t NOINLINE VPCS_ATTR
|
||||
special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
|
||||
uint64x2_t special)
|
||||
{
|
||||
return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
|
||||
}
|
||||
|
||||
/* Vector approximation for double-precision tanh(x), using a simplified
|
||||
version of expm1. The greatest observed error is 2.70 ULP:
|
||||
_ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
|
||||
want -0x1.be5452a6459fbp-3. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* Bit pattern of |x|, so the range checks below are unsigned integer
   compares. */
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
|
||||
|
||||
/* Working copy of the input; x itself is kept for the special-case call. */
float64x2_t u = x;
|
||||
|
||||
/* Trigger special-cases for tiny, boring and infinity/NaN. */
|
||||
uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* To trigger fp exceptions correctly, set special lanes to a neutral value.
|
||||
They will be fixed up later by the special-case handler. */
|
||||
if (unlikely (v_any_u64 (special)))
|
||||
u = v_zerofy_f64 (u, special);
|
||||
#endif
|
||||
|
||||
/* u = 2x, the argument of expm1 in the identity below. */
u = vaddq_f64 (u, u);
|
||||
|
||||
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
||||
float64x2_t q = expm1_inline (u, &d->d);
|
||||
/* q = e^2x - 1, so the denominator e^2x + 1 is q + 2. */
float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
|
||||
|
||||
/* The division is left to special_case when any lane is special, so it is
   performed exactly once on either path. */
if (unlikely (v_any_u64 (special)))
|
||||
return special_case (x, q, qp2, special);
|
||||
return vdivq_f64 (q, qp2);
|
||||
}
|
||||
|
||||
TEST_SIG (V, D, 1, tanh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_D1 (tanh), 2.21)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
|
||||
|
|
@ -1,14 +1,13 @@
|
|||
/*
|
||||
* Single-precision vector tanh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_expm1f_inline.h"
|
||||
|
||||
static const struct data
|
||||
|
|
@ -20,20 +19,23 @@ static const struct data
|
|||
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
|
||||
.boring_bound = V4 (0x41102cb3),
|
||||
.large_bound = V4 (0x7f800000),
|
||||
.onef = V4 (0x3f800000),
|
||||
};
|
||||
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
|
||||
float32x4_t q, uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (tanhf, x, y, special);
|
||||
return v_call_f32 (
|
||||
tanhf, x,
|
||||
vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
|
||||
special);
|
||||
}
|
||||
|
||||
/* Approximation for single-precision vector tanh(x), using a simplified
|
||||
version of expm1f. The maximum error is 2.58 ULP:
|
||||
_ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
|
||||
want 0x1.f9ba08p-5. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -42,7 +44,9 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
|
|||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t sign = veorq_u32 (ix, iax);
|
||||
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
|
||||
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
|
||||
/* expm1 exponent bias is 1.0f reinterpreted to int. */
|
||||
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
|
||||
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If fp exceptions are to be triggered properly, set all special and boring
|
||||
|
|
@ -58,16 +62,20 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
|
|||
|
||||
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
||||
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
|
||||
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
||||
|
||||
if (unlikely (v_any_u32 (special)))
|
||||
return special_case (vreinterpretq_f32_u32 (ix),
|
||||
vbslq_f32 (is_boring, boring, y), special);
|
||||
return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
|
||||
special);
|
||||
|
||||
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
||||
return vbslq_f32 (is_boring, boring, y);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, tanh, -10.0, 10.0)
|
||||
PL_TEST_ULP (V_NAME_F1 (tanh), 2.09)
|
||||
PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
|
||||
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
|
||||
HALF_WIDTH_ALIAS_F1 (tanh)
|
||||
|
||||
TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
|
||||
TEST_ULP (V_NAME_F1 (tanh), 2.09)
|
||||
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
|
||||
88
contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
Normal file
88
contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* Double-precision vector tanpi(x) function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Polynomial coefficients for the double-precision vector tanpi kernel.
   Even-index coefficients (except c14) are pre-broadcast vectors used as fma
   accumulators; the remaining ones are scalar doubles declared contiguously
   so the kernel can load adjacent pairs with vld1q_f64 (&c1, &c5, &c9, &c13)
   and select them by lane. Do not reorder the scalar members. */
static const struct v_tanpi_data
|
||||
{
|
||||
float64x2_t c0, c2, c4, c6, c8, c10, c12;
|
||||
double c1, c3, c5, c7, c9, c11, c13, c14;
|
||||
} tanpi_data = {
|
||||
/* Coefficients for tan(pi * x) computed with fpminimax
|
||||
on [ 0x1p-1022 0x1p-2 ]
|
||||
approx rel error: 0x1.7eap-55
|
||||
approx abs error: 0x1.7eap-55. */
|
||||
.c0 = V2 (0x1.921fb54442d18p1), /* pi. */
|
||||
.c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5),
|
||||
.c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9),
|
||||
.c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13),
|
||||
.c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17),
|
||||
.c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21),
|
||||
.c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27),
|
||||
.c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
|
||||
};
|
||||
|
||||
/* Approximation for double-precision vector tanpi(x)
|
||||
The maximum error is 3.06 ULP:
|
||||
_ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
|
||||
want -0x1.fa30112702c95p+3. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
|
||||
{
|
||||
const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
|
||||
|
||||
/* Nearest integer to x; tan(pi * x) has period 1 in x, so only x - n is
   needed from here on. */
float64x2_t n = vrndnq_f64 (x);
|
||||
|
||||
/* inf produces nan that propagates. */
|
||||
float64x2_t xr = vsubq_f64 (x, n);
|
||||
/* ar = |x - n|. */
float64x2_t ar = vabdq_f64 (x, n);
|
||||
/* Where ar > 0.25 the polynomial is evaluated at 0.5 - ar instead, and the
   reciprocal is taken at the end (tan (pi/2 - t) = 1 / tan (t)). */
uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
|
||||
float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
|
||||
|
||||
/* Order-14 pairwise Horner. */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t r4 = vmulq_f64 (r2, r2);
|
||||
|
||||
/* Each load picks up two adjacent scalar coefficients from the data struct;
   they are then selected by lane in the fmas below. */
float64x2_t c_1_3 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c_5_7 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c_9_11 = vld1q_f64 (&d->c9);
|
||||
float64x2_t c_13_14 = vld1q_f64 (&d->c13);
|
||||
/* Pairs c_k + c_{k+1} * r2. */
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
|
||||
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
|
||||
|
||||
/* Horner in r4 over the pairs, starting from c14 as the top term. */
float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
|
||||
p = vfmaq_f64 (p1011, r4, p);
|
||||
p = vfmaq_f64 (p89, r4, p);
|
||||
p = vfmaq_f64 (p67, r4, p);
|
||||
p = vfmaq_f64 (p45, r4, p);
|
||||
p = vfmaq_f64 (p23, r4, p);
|
||||
p = vfmaq_f64 (p01, r4, p);
|
||||
/* The polynomial is in r2; the final multiply by r makes it odd in r. */
p = vmulq_f64 (r, p);
|
||||
|
||||
float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
|
||||
float64x2_t y = vbslq_f64 (flip, p_recip, p);
|
||||
|
||||
/* XOR of x - n with |x - n| isolates the sign bit of x - n, which is then
   OR-ed into the result. */
uint64x2_t sign
|
||||
= veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
|
||||
return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
|
||||
}
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
|
||||
TEST_ULP (V_NAME_D1 (tanpi), 2.57)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
|
||||
#endif
|
||||
70
contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
Normal file
70
contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Single-precision vector tanpi(x) function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Polynomial coefficients for the single-precision vector tanpi kernel.
   Even-index coefficients are pre-broadcast vectors used as fma accumulators;
   the odd-index ones are four contiguous scalar floats that the kernel loads
   with a single vld1q_f32 (&c1) and selects by lane. Do not reorder the
   scalar members. */
static const struct v_tanpif_data
|
||||
{
|
||||
float32x4_t c0, c2, c4, c6;
|
||||
float c1, c3, c5, c7;
|
||||
} tanpif_data = {
|
||||
/* Coefficients for tan(pi * x). */
|
||||
.c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f),
|
||||
.c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
|
||||
.c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
|
||||
};
|
||||
|
||||
/* Approximation for single-precision vector tanpi(x)
|
||||
The maximum error is 3.34 ULP:
|
||||
_ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
|
||||
want 0x1.f70aa6p+2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
|
||||
{
|
||||
const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
|
||||
|
||||
/* Nearest integer to x; tan(pi * x) has period 1 in x, so only x - n is
   needed from here on. */
float32x4_t n = vrndnq_f32 (x);
|
||||
|
||||
/* inf produces nan that propagates. */
|
||||
float32x4_t xr = vsubq_f32 (x, n);
|
||||
/* ar = |x - n|. */
float32x4_t ar = vabdq_f32 (x, n);
|
||||
/* Where ar > 0.25 the polynomial is evaluated at 0.5 - ar instead, and the
   reciprocal is taken at the end (tan (pi/2 - t) = 1 / tan (t)). */
uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
|
||||
float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
|
||||
|
||||
/* Order-7 pairwise Horner polynomial evaluation scheme. */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t r4 = vmulq_f32 (r2, r2);
|
||||
|
||||
/* One load brings in c1, c3, c5, c7 as lanes 0..3. */
float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
|
||||
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
|
||||
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
|
||||
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
|
||||
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
|
||||
/* Horner in r4 over the pairs. */
float32x4_t p = vfmaq_f32 (p45, r4, p67);
|
||||
p = vfmaq_f32 (p23, r4, p);
|
||||
p = vfmaq_f32 (p01, r4, p);
|
||||
|
||||
/* The polynomial is in r2; the final multiply by r makes it odd in r. */
p = vmulq_f32 (r, p);
|
||||
float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
|
||||
float32x4_t y = vbslq_f32 (flip, p_recip, p);
|
||||
|
||||
/* XOR of x - n with |x - n| isolates the sign bit of x - n, which is then
   OR-ed into the result. */
uint32x4_t sign
|
||||
= veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
|
||||
return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
|
||||
}
|
||||
|
||||
HALF_WIDTH_ALIAS_F1 (tanpi)
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
|
||||
TEST_ULP (V_NAME_F1 (tanpi), 2.84)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
|
||||
#endif
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Helper for single-precision routines which calculate exp(ax) and do not
|
||||
* need special-case handling
|
||||
*
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef MATH_V_EXPF_INLINE_H
|
||||
#define MATH_V_EXPF_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for v_expf_inline. The four leading scalars are declared
   contiguously because the helper loads them with one vld1q_f32 starting at
   &ln2_hi and then addresses them by lane (0: ln2_hi, 1: ln2_lo, 2: c0,
   3: c2). Do not reorder them. */
struct v_expf_data
|
||||
{
|
||||
float ln2_hi, ln2_lo, c0, c2;
|
||||
float32x4_t inv_ln2, c1, c3, c4;
|
||||
/* asuint(1.0f). */
|
||||
uint32x4_t exponent_bias;
|
||||
};
|
||||
|
||||
/* Brace-enclosed designated initializer for a struct v_expf_data. */
/* maxerr: 1.45358 +0.5 ulp. */
|
||||
#define V_EXPF_DATA \
|
||||
{ \
|
||||
.c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
|
||||
.c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
|
||||
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
|
||||
.inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
|
||||
}
|
||||
|
||||
/* Returns an approximation of exp (|x|) for each lane of x, using the
   constants in *d. Note that the absolute value of the input is taken first.
   No special-case handling is performed. */
static inline float32x4_t
|
||||
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
|
||||
{
|
||||
/* Helper routine for calculating exp(ax).
|
||||
Copied from v_expf.c, with all special-case handling removed - the
|
||||
calling routine should handle special values if required. */
|
||||
|
||||
/* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
/* Lanes: 0 = ln2_hi, 1 = ln2_lo, 2 = c0, 3 = c2. */
float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
|
||||
float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
|
||||
/* Subtract n * ln2 in two steps, high part first, then low part. */
float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
|
||||
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
|
||||
/* Build 2^n directly: shift n into the exponent field and add the bit
   pattern of 1.0f. */
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
/* Custom order-4 Estrin avoids building high order monomial. */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
|
||||
float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
|
||||
q = vfmaq_f32 (q, p, r2);
|
||||
p = vmulq_f32 (d->c4, r);
|
||||
float32x4_t poly = vfmaq_f32 (p, q, r2);
|
||||
/* scale * (1 + poly), evaluated as scale + poly * scale in one fma. */
return vfmaq_f32 (scale, poly, scale);
|
||||
}
|
||||
|
||||
#endif // MATH_V_EXPF_INLINE_H
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* Helper for double-precision routines which calculate exp(x) - 1 and do not
|
||||
* need special-case handling
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef MATH_V_EXPM1_INLINE_H
|
||||
#define MATH_V_EXPM1_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for expm1_inline. The scalar doubles c1..c10 are declared
   contiguously so the helper can load adjacent pairs (c1,c3), (c5,c7) and
   (c9,c10) with vld1q_f64 and select them by lane; ln2 holds ln2 split in
   two parts that are subtracted one after the other during range reduction.
   Do not reorder the scalar members. */
struct v_expm1_data
|
||||
{
|
||||
float64x2_t c2, c4, c6, c8;
|
||||
float64x2_t invln2;
|
||||
int64x2_t exponent_bias;
|
||||
double c1, c3, c5, c7, c9, c10;
|
||||
double ln2[2];
|
||||
};
|
||||
|
||||
/* Brace-enclosed designated initializer for a struct v_expm1_data.
   Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
|
||||
#define V_EXPM1_DATA \
|
||||
{ \
|
||||
.c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
|
||||
.c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
|
||||
.c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
|
||||
.c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
|
||||
.c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
|
||||
.ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
|
||||
.invln2 = V2 (0x1.71547652b82fep0), \
|
||||
.exponent_bias = V2 (0x3ff0000000000000), \
|
||||
}
|
||||
|
||||
/* Returns an approximation of exp (x) - 1 for each lane of x, using the
   constants in *d. No special-case handling is performed here; callers are
   expected to deal with special inputs themselves. */
static inline float64x2_t
|
||||
expm1_inline (float64x2_t x, const struct v_expm1_data *d)
|
||||
{
|
||||
/* Helper routine for calculating exp(x) - 1. */
|
||||
|
||||
/* Lane 0 and lane 1 hold the two parts of ln2. */
float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
|
||||
|
||||
/* Reduce argument to smaller range:
|
||||
Let i = round(x / ln2)
|
||||
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
|
||||
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
|
||||
where 2^i is exact because i is an integer. */
|
||||
float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
|
||||
int64x2_t i = vcvtq_s64_f64 (n);
|
||||
float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
|
||||
f = vfmsq_laneq_f64 (f, n, ln2, 1);
|
||||
|
||||
/* Approximate expm1(f) using polynomial.
|
||||
Taylor expansion for expm1(x) has the form:
|
||||
x + ax^2 + bx^3 + cx^4 ....
|
||||
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
|
||||
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
|
||||
float64x2_t f2 = vmulq_f64 (f, f);
|
||||
float64x2_t f4 = vmulq_f64 (f2, f2);
|
||||
/* Each load picks up two adjacent scalar coefficients from *d. */
float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
|
||||
/* The leading coefficient 0.5 is supplied inline rather than from the
   table. */
float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
|
||||
float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
|
||||
float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
|
||||
float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
|
||||
p = vfmaq_f64 (p47, f4, p);
|
||||
p = vfmaq_f64 (p03, f4, p);
|
||||
|
||||
/* p now holds f + f^2 * P(f). */
p = vfmaq_f64 (f, f2, p);
|
||||
|
||||
/* Assemble the result.
|
||||
expm1(x) ~= 2^i * (p + 1) - 1
|
||||
Let t = 2^i. */
|
||||
int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
|
||||
float64x2_t t = vreinterpretq_f64_s64 (u);
|
||||
|
||||
/* expm1(x) ~= p * t + (t - 1). */
|
||||
return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
|
||||
}
|
||||
|
||||
#endif // MATH_V_EXPM1_INLINE_H
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Helper for single-precision routines which calculate exp(x) - 1 and do not
|
||||
* need special-case handling
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef MATH_V_EXPM1F_INLINE_H
|
||||
#define MATH_V_EXPM1F_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for expm1f_inline. The scalars c1, c3, inv_ln2, c4 are declared
   contiguously because the helper loads them with one vld1q_f32 starting at
   &c1 and addresses them by lane (0: c1, 1: c3, 2: inv_ln2, 3: c4);
   ln2_hi and ln2_lo are likewise loaded as a pair with vld1_f32.
   Do not reorder the scalar members. */
struct v_expm1f_data
|
||||
{
|
||||
float32x4_t c0, c2;
|
||||
int32x4_t exponent_bias;
|
||||
float c1, c3, inv_ln2, c4;
|
||||
float ln2_hi, ln2_lo;
|
||||
};
|
||||
|
||||
/* Brace-enclosed designated initializer for a struct v_expm1f_data. */
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
|
||||
log(2)/2]. Exponent bias is asuint(1.0f). */
|
||||
#define V_EXPM1F_DATA \
|
||||
{ \
|
||||
.c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
|
||||
.c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
|
||||
.exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
|
||||
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
|
||||
}
|
||||
|
||||
/* Returns an approximation of exp (x) - 1 for each lane of x, using the
   constants in *d. No special-case handling is performed here. */
static inline float32x4_t
|
||||
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
|
||||
{
|
||||
/* Helper routine for calculating exp(x) - 1. */
|
||||
|
||||
/* ln2: lanes 0/1 = ln2_hi/ln2_lo.
   lane_consts: lanes 0..3 = c1, c3, inv_ln2, c4. */
float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
|
||||
float32x4_t lane_consts = vld1q_f32 (&d->c1);
|
||||
|
||||
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
|
||||
float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
|
||||
int32x4_t i = vcvtq_s32_f32 (j);
|
||||
float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
|
||||
f = vfmsq_lane_f32 (f, j, ln2, 1);
|
||||
|
||||
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
|
||||
float32x4_t f2 = vmulq_f32 (f, f);
|
||||
float32x4_t f4 = vmulq_f32 (f2, f2);
|
||||
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
|
||||
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
|
||||
float32x4_t p = vfmaq_f32 (p01, f2, p23);
|
||||
/* Add the top term c4 * f^4. */
p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
|
||||
p = vfmaq_f32 (f, f2, p);
|
||||
|
||||
/* t = 2^i. */
|
||||
int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
|
||||
float32x4_t t = vreinterpretq_f32_s32 (u);
|
||||
/* expm1(x) ~= p * t + (t - 1). */
|
||||
return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
|
||||
}
|
||||
|
||||
#endif // MATH_V_EXPM1F_INLINE_H
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
* Helper for vector double-precision routines which calculate log(1 + x) and
|
||||
* do not need special-case handling
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#ifndef MATH_V_LOG1P_INLINE_H
|
||||
#define MATH_V_LOG1P_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for the inline vector log1p helper. The scalar doubles
   c1..c18 are declared contiguously because eval_poly loads adjacent pairs
   from &c1, &c5, &c9, &c13 and &c17 with vld1q_f64 and selects them by lane;
   do not reorder them.
   NOTE(review): the remaining members (bit masks, one_top, ln2) are used by
   code outside this view - confirm their roles there. */
struct v_log1p_data
|
||||
{
|
||||
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
|
||||
uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
|
||||
int64x2_t one_top;
|
||||
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
|
||||
double ln2[2];
|
||||
};
|
||||
|
||||
/* Brace-enclosed designated initializer for a struct v_log1p_data. */
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
|
||||
#define V_LOG1P_CONSTANTS_TABLE \
|
||||
{ \
|
||||
.c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
|
||||
.c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
|
||||
.c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
|
||||
.c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
|
||||
.c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
|
||||
.c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
|
||||
.c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
|
||||
.c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
|
||||
.c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
|
||||
.c18 = -0x1.cfa7385bdb37ep-6, \
|
||||
.ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
|
||||
.hf_rt2_top = V2 (0x3fe6a09e00000000), \
|
||||
.one_m_hf_rt2_top = V2 (0x00095f6200000000), \
|
||||
.umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
|
||||
}
|
||||
|
||||
/* Mask with the low 32 bits set; log1p_inline ANDs it with the bits of x + 1
   to carry the low half of the mantissa over into the reduced value.  */
#define BottomMask v_u64 (0xffffffff)
|
||||
|
||||
static inline float64x2_t
|
||||
eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
|
||||
{
|
||||
/* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
|
||||
float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
float64x2_t c1718 = vld1q_f64 (&d->c17);
|
||||
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
|
||||
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
|
||||
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
|
||||
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
|
||||
float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
|
||||
p = vfmaq_f64 (p1415, m2, p);
|
||||
p = vfmaq_f64 (p1213, m2, p);
|
||||
p = vfmaq_f64 (p1011, m2, p);
|
||||
p = vfmaq_f64 (p89, m2, p);
|
||||
p = vfmaq_f64 (p67, m2, p);
|
||||
p = vfmaq_f64 (p45, m2, p);
|
||||
p = vfmaq_f64 (p23, m2, p);
|
||||
return vfmaq_f64 (p01, m2, p);
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
log1p_inline (float64x2_t x, const struct v_log1p_data *d)
|
||||
{
|
||||
/* Helper for calculating log(x + 1):
|
||||
- No special-case handling - this should be dealt with by the caller.
|
||||
- Optionally simulate the shortcut for k=0, used in the scalar routine,
|
||||
using v_sel, for improved accuracy when the argument to log1p is close
|
||||
to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
|
||||
in the source of the caller before including this file. */
|
||||
float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
|
||||
uint64x2_t mi = vreinterpretq_u64_f64 (m);
|
||||
uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
|
||||
|
||||
int64x2_t ki
|
||||
= vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
|
||||
float64x2_t k = vcvtq_f64_s64 (ki);
|
||||
|
||||
/* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
|
||||
uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
|
||||
uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
|
||||
float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
|
||||
|
||||
/* Correction term c/m. */
|
||||
float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
|
||||
|
||||
#ifndef WANT_V_LOG1P_K0_SHORTCUT
|
||||
# error \
|
||||
"Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
|
||||
#elif WANT_V_LOG1P_K0_SHORTCUT
|
||||
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
|
||||
that the approximation is solely the polynomial. */
|
||||
uint64x2_t k0 = vceqzq_f64 (k);
|
||||
cm = v_zerofy_f64 (cm, k0);
|
||||
f = vbslq_f64 (k0, x, f);
|
||||
#endif
|
||||
|
||||
/* Approximate log1p(f) on the reduced input using a polynomial. */
|
||||
float64x2_t f2 = vmulq_f64 (f, f);
|
||||
float64x2_t p = eval_poly (f, f2, d);
|
||||
|
||||
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
|
||||
float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
|
||||
float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
|
||||
float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
|
||||
return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
|
||||
}
|
||||
|
||||
#endif // MATH_V_LOG1P_INLINE_H
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* Helper for single-precision routines which calculate log(1 + x) and do not
|
||||
* need special-case handling
|
||||
*
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef MATH_V_LOG1PF_INLINE_H
|
||||
#define MATH_V_LOG1PF_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
#include "v_poly_f32.h"
|
||||
|
||||
/* Constants for the inline single-precision log1p helper.
   Layout matters: eval_poly reads c0, c3, c5 and c7 with a single vld1q_f32
   from &d->c0, so those four floats must stay contiguous and in this
   order.  */
struct v_log1pf_data
|
||||
{
|
||||
/* Integer constants combined with raw float bits in log1pf_inline.  */
uint32x4_t four;
|
||||
int32x4_t three_quarters;
|
||||
/* Lane-indexed coefficients: lanes 0..3 of the vector loaded from &c0.  */
float c0, c3, c5, c7;
|
||||
/* c1, c2, c4 and c6 are used as whole-vector FMA addends in eval_poly; ln2
   is the multiplier in the final scaling step of log1pf_inline.  */
float32x4_t c4, c6, c1, c2, ln2;
|
||||
};
|
||||
|
||||
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
   (1, -0.5) are not stored as they can be generated more efficiently.
   Initialiser for struct v_log1pf_data: c0, c3, c5 and c7 are scalars (they
   are loaded together and picked by lane), the other coefficients are
   replicated with V4.  four and three_quarters are the IEEE-754 bit patterns
   of 4.0f (0x40800000) and 0.75f (0x3f400000).  */
|
||||
#define V_LOG1PF_CONSTANTS_TABLE \
|
||||
{ \
|
||||
.c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
|
||||
.c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
|
||||
.c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
|
||||
.c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
|
||||
.ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
|
||||
.three_quarters = V4 (0x3f400000) \
|
||||
}
|
||||
|
||||
static inline float32x4_t
|
||||
eval_poly (float32x4_t m, const struct v_log1pf_data *d)
|
||||
{
|
||||
/* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
|
||||
float32x4_t c0357 = vld1q_f32 (&d->c0);
|
||||
float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
|
||||
float32x4_t m2 = vmulq_f32 (m, m);
|
||||
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
|
||||
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
|
||||
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
|
||||
float32x4_t p = vfmaq_f32 (p45, m2, p67);
|
||||
p = vfmaq_f32 (p23, m2, p);
|
||||
p = vfmaq_f32 (d->c1, m, p);
|
||||
p = vmulq_f32 (m2, p);
|
||||
p = vfmaq_f32 (m, m2, p);
|
||||
return vfmaq_f32 (p, m2, q);
|
||||
}
|
||||
|
||||
static inline float32x4_t
|
||||
log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
|
||||
{
|
||||
/* Helper for calculating log(x + 1). */
|
||||
|
||||
/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
|
||||
is in [-0.25, 0.5]):
|
||||
log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
|
||||
|
||||
We approximate log1p(m) with a polynomial, then scale by
|
||||
k*log(2). Instead of doing this directly, we use an intermediate
|
||||
scale factor s = 4*k*log(2) to ensure the scale is representable
|
||||
as a normalised fp32 number. */
|
||||
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
|
||||
|
||||
/* Choose k to scale x to the range [-1/4, 1/2]. */
|
||||
int32x4_t k
|
||||
= vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
|
||||
v_s32 (0xff800000));
|
||||
uint32x4_t ku = vreinterpretq_u32_s32 (k);
|
||||
|
||||
/* Scale up to ensure that the scale factor is representable as normalised
|
||||
fp32 number, and scale m down accordingly. */
|
||||
float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
|
||||
|
||||
/* Scale x by exponent manipulation. */
|
||||
float32x4_t m_scale
|
||||
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
|
||||
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
|
||||
|
||||
/* Evaluate polynomial on the reduced interval. */
|
||||
float32x4_t p = eval_poly (m_scale, d);
|
||||
|
||||
/* The scale factor to be applied back at the end - by multiplying float(k)
|
||||
by 2^-23 we get the unbiased exponent of k. */
|
||||
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
|
||||
|
||||
/* Apply the scaling back. */
|
||||
return vfmaq_f32 (p, scale_back, d->ln2);
|
||||
}
|
||||
|
||||
#endif // MATH_V_LOG1PF_INLINE_H
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Double-precision vector log(x) function - inline version
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
|
|
@ -57,8 +57,8 @@ log_lookup (uint64x2_t i)
|
|||
{
|
||||
/* Since N is a power of 2, n % N = n & (N - 1). */
|
||||
struct entry e;
|
||||
uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
|
|
@ -1,36 +1,63 @@
|
|||
/*
|
||||
* Vector math abstractions.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef _V_MATH_H
|
||||
#define _V_MATH_H
|
||||
|
||||
#ifndef WANT_VMATH
|
||||
/* Enable the build of vector math code. */
|
||||
# define WANT_VMATH 1
|
||||
#if !__aarch64__
|
||||
# error "Cannot build without AArch64"
|
||||
#endif
|
||||
|
||||
#if WANT_VMATH
|
||||
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
|
||||
|
||||
# if __aarch64__
|
||||
# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
|
||||
# else
|
||||
# error "Cannot build without AArch64"
|
||||
# endif
|
||||
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
|
||||
#define V_NAME_D1(fun) _ZGVnN2v_##fun
|
||||
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
|
||||
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
|
||||
#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
|
||||
#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
|
||||
|
||||
# include <stdint.h>
|
||||
# include "math_config.h"
|
||||
# if __aarch64__
|
||||
#if USE_GLIBC_ABI
|
||||
|
||||
# include <arm_neon.h>
|
||||
# define HALF_WIDTH_ALIAS_F1(fun) \
|
||||
float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
|
||||
{ \
|
||||
return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \
|
||||
}
|
||||
|
||||
# define HALF_WIDTH_ALIAS_F2(fun) \
|
||||
float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
|
||||
{ \
|
||||
return vget_low_f32 ( \
|
||||
_ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \
|
||||
}
|
||||
|
||||
#else
|
||||
# define HALF_WIDTH_ALIAS_F1(fun)
|
||||
# define HALF_WIDTH_ALIAS_F2(fun)
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "math_config.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* Shorthand helpers for declaring constants. */
|
||||
# define V2(X) { X, X }
|
||||
# define V4(X) { X, X, X, X }
|
||||
# define V8(X) { X, X, X, X, X, X, X, X }
|
||||
#define V2(X) \
|
||||
{ \
|
||||
X, X \
|
||||
}
|
||||
#define V4(X) \
|
||||
{ \
|
||||
X, X, X, X \
|
||||
}
|
||||
#define V8(X) \
|
||||
{ \
|
||||
X, X, X, X, X, X, X, X \
|
||||
}
|
||||
|
||||
static inline int
|
||||
v_any_u16h (uint16x4_t x)
|
||||
|
|
@ -38,6 +65,12 @@ v_any_u16h (uint16x4_t x)
|
|||
return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
|
||||
}
|
||||
|
||||
/* Lane count for 32-bit element vectors: always 4, matching the
   four-element float32x4_t / uint32x4_t types used by these helpers.  */
static inline int
|
||||
v_lanes32 (void)
|
||||
{
|
||||
return 4;
|
||||
}
|
||||
|
||||
static inline float32x4_t
|
||||
v_f32 (float x)
|
||||
{
|
||||
|
|
@ -54,7 +87,7 @@ v_s32 (int32_t x)
|
|||
return (int32x4_t) V4 (x);
|
||||
}
|
||||
|
||||
/* true if any elements of a vector compare result is non-zero. */
|
||||
/* true if any elements of a v_cond result is non-zero. */
|
||||
static inline int
|
||||
v_any_u32 (uint32x4_t x)
|
||||
{
|
||||
|
|
@ -97,6 +130,11 @@ v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
|
|||
return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
|
||||
}
|
||||
|
||||
/* Lane count for 64-bit element vectors: always 2, matching the
   two-element float64x2_t / uint64x2_t types used by these helpers.  */
static inline int
|
||||
v_lanes64 (void)
|
||||
{
|
||||
return 2;
|
||||
}
|
||||
static inline float64x2_t
|
||||
v_f64 (double x)
|
||||
{
|
||||
|
|
@ -113,20 +151,13 @@ v_s64 (int64_t x)
|
|||
return (int64x2_t) V2 (x);
|
||||
}
|
||||
|
||||
/* true if any elements of a vector compare result is non-zero. */
|
||||
/* true if any elements of a v_cond result is non-zero. */
|
||||
static inline int
|
||||
v_any_u64 (uint64x2_t x)
|
||||
{
|
||||
/* assume elements in x are either 0 or -1u. */
|
||||
return vpaddd_u64 (x) != 0;
|
||||
}
|
||||
/* true if all elements of a vector compare result is 1. */
|
||||
static inline int
|
||||
v_all_u64 (uint64x2_t x)
|
||||
{
|
||||
/* assume elements in x are either 0 or -1u. */
|
||||
return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
|
||||
}
|
||||
static inline float64x2_t
|
||||
v_lookup_f64 (const double *tab, uint64x2_t idx)
|
||||
{
|
||||
|
|
@ -137,7 +168,6 @@ v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
|
|||
{
|
||||
return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
|
||||
{
|
||||
|
|
@ -169,7 +199,4 @@ v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
|
|||
return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
|
||||
}
|
||||
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -2,12 +2,12 @@
|
|||
* Helpers for evaluating polynomials on single-precision AdvSIMD input, using
|
||||
* various schemes.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef PL_MATH_POLY_ADVSIMD_F32_H
|
||||
#define PL_MATH_POLY_ADVSIMD_F32_H
|
||||
#ifndef MATH_POLY_ADVSIMD_F32_H
|
||||
#define MATH_POLY_ADVSIMD_F32_H
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
|
@ -2,12 +2,12 @@
|
|||
* Helpers for evaluating polynomials on double-precision AdvSIMD input, using
|
||||
* various schemes.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef PL_MATH_POLY_ADVSIMD_F64_H
|
||||
#define PL_MATH_POLY_ADVSIMD_F64_H
|
||||
#ifndef MATH_POLY_ADVSIMD_F64_H
|
||||
#define MATH_POLY_ADVSIMD_F64_H
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
|
@ -1,12 +1,12 @@
|
|||
/*
|
||||
* Core approximation for double-precision vector sincos
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "v_poly_f64.h"
|
||||
|
||||
static const struct v_sincos_data
|
||||
{
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Core approximation for single-precision vector sincos
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Helper for Double-precision vector sincospi function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "v_math.h"
|
||||
#include "v_poly_f64.h"
|
||||
|
||||
static const struct v_sincospi_data
|
||||
{
|
||||
float64x2_t poly[10], range_val;
|
||||
} v_sincospi_data = {
|
||||
/* Polynomial coefficients generated using Remez algorithm,
|
||||
see sinpi.sollya for details. */
|
||||
.poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
|
||||
V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
|
||||
V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
|
||||
V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
|
||||
V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
|
||||
.range_val = V2 (0x1p63),
|
||||
};
|
||||
|
||||
/* Double-precision vector function allowing calculation of both sin and cos in
|
||||
one function call, using separate argument reduction and shared low-order
|
||||
polynomials.
|
||||
Approximation for vector double-precision sincospi(x).
|
||||
Maximum Error 3.09 ULP:
|
||||
_ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
|
||||
want 0x1.fd54d0b327cf4p-1
|
||||
Maximum Error 3.16 ULP:
|
||||
_ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
|
||||
want 0x1.fd2da484ff402p-1. */
|
||||
static inline float64x2x2_t
|
||||
v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
|
||||
{
|
||||
/* If r is odd, the sign of the result should be inverted for sinpi
|
||||
and reintroduced for cospi. */
|
||||
uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
|
||||
uint64x2_t odd = vshlq_n_u64 (
|
||||
vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
|
||||
|
||||
/* r = x - rint(x). */
|
||||
float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
|
||||
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
|
||||
float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
|
||||
|
||||
/* Pairwise Horner approximation for y = sin(r * pi). */
|
||||
float64x2_t sr2 = vmulq_f64 (sr, sr);
|
||||
float64x2_t sr4 = vmulq_f64 (sr2, sr2);
|
||||
float64x2_t cr2 = vmulq_f64 (cr, cr);
|
||||
float64x2_t cr4 = vmulq_f64 (cr2, cr2);
|
||||
|
||||
float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
|
||||
float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
|
||||
|
||||
float64x2_t sinpix
|
||||
= vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
|
||||
|
||||
float64x2_t cospix
|
||||
= vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
|
||||
|
||||
return (float64x2x2_t){ sinpix, cospix };
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Helper for Single-precision vector sincospi function.
|
||||
*
|
||||
* Copyright (c) 2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "v_poly_f32.h"
|
||||
|
||||
const static struct v_sincospif_data
|
||||
{
|
||||
float32x4_t poly[6], range_val;
|
||||
} v_sincospif_data = {
|
||||
/* Taylor series coefficents for sin(pi * x). */
|
||||
.poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
|
||||
V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
|
||||
.range_val = V4 (0x1p31f),
|
||||
};
|
||||
|
||||
/* Single-precision vector function allowing calculation of both sinpi and
|
||||
cospi in one function call, using shared argument reduction and polynomials.
|
||||
Worst-case error for sin is 3.04 ULP:
|
||||
_ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
|
||||
Worst-case error for cos is 3.18 ULP:
|
||||
_ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
|
||||
*/
|
||||
static inline float32x4x2_t
|
||||
v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
|
||||
{
|
||||
/* If r is odd, the sign of the result should be inverted for sinpi and
|
||||
reintroduced for cospi. */
|
||||
uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
|
||||
uint32x4_t odd = vshlq_n_u32 (
|
||||
vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
|
||||
|
||||
/* r = x - rint(x). */
|
||||
float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
|
||||
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
|
||||
float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
|
||||
|
||||
/* Pairwise Horner approximation for y = sin(r * pi). */
|
||||
float32x4_t sr2 = vmulq_f32 (sr, sr);
|
||||
float32x4_t sr4 = vmulq_f32 (sr2, sr2);
|
||||
float32x4_t cr2 = vmulq_f32 (cr, cr);
|
||||
float32x4_t cr4 = vmulq_f32 (cr2, cr2);
|
||||
|
||||
float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
|
||||
float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
|
||||
|
||||
float32x4_t sinpix
|
||||
= vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
|
||||
float32x4_t cospix
|
||||
= vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
|
||||
|
||||
return (float32x4x2_t){ sinpix, cospix };
|
||||
}
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Double-precision scalar cospi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "poly_scalar_f64.h"
|
||||
|
||||
/* Taylor series coefficients for sin(pi * x).
|
||||
|
|
@ -29,9 +29,9 @@ static const double poly[]
|
|||
cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
|
||||
want 0x1.fffffffffd16ep-1. */
|
||||
double
|
||||
cospi (double x)
|
||||
arm_math_cospi (double x)
|
||||
{
|
||||
if (isinf (x))
|
||||
if (isinf (x) || isnan (x))
|
||||
return __math_invalid (x);
|
||||
|
||||
double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
|
||||
|
|
@ -81,9 +81,18 @@ cospi (double x)
|
|||
return asdouble (asuint64 (y) ^ sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, cospi, -0.9, 0.9)
|
||||
PL_TEST_ULP (cospi, 2.63)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
|
||||
#if WANT_EXPERIMENTAL_MATH
|
||||
/* Public cospi entry point: forwards unchanged to arm_math_cospi.  Only
   compiled when WANT_EXPERIMENTAL_MATH is non-zero (see the enclosing
   #if).  */
double
|
||||
cospi (double x)
|
||||
{
|
||||
return arm_math_cospi (x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_ULP (arm_math_cospi, 2.63)
|
||||
TEST_SYM_INTERVAL (arm_math_cospi, 0, 0x1p-63, 5000)
|
||||
TEST_SYM_INTERVAL (arm_math_cospi, 0x1p-63, 0.5, 10000)
|
||||
TEST_SYM_INTERVAL (arm_math_cospi, 0.5, 0x1p51f, 10000)
|
||||
TEST_SYM_INTERVAL (arm_math_cospi, 0x1p51f, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
/*
|
||||
* Single-precision scalar cospi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
/* Taylor series coefficients for sin(pi * x). */
|
||||
#define C0 0x1.921fb6p1f
|
||||
|
|
@ -25,9 +25,9 @@
|
|||
cospif(0x1.37e844p-4) got 0x1.f16b3p-1
|
||||
want 0x1.f16b2ap-1. */
|
||||
float
|
||||
cospif (float x)
|
||||
arm_math_cospif (float x)
|
||||
{
|
||||
if (isinf (x))
|
||||
if (isinf (x) || isnan (x))
|
||||
return __math_invalidf (x);
|
||||
|
||||
float ax = asfloat (asuint (x) & ~0x80000000);
|
||||
|
|
@ -76,9 +76,18 @@ cospif (float x)
|
|||
return asfloat (asuint (y * r) ^ sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, cospi, -0.9, 0.9)
|
||||
PL_TEST_ULP (cospif, 2.15)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)
|
||||
#if WANT_EXPERIMENTAL_MATH
|
||||
/* Public cospif entry point: forwards unchanged to arm_math_cospif.  Only
   compiled when WANT_EXPERIMENTAL_MATH is non-zero (see the enclosing
   #if).  */
float
|
||||
cospif (float x)
|
||||
{
|
||||
return arm_math_cospif (x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if WANT_TRIGPI_TESTS
|
||||
TEST_ULP (arm_math_cospif, 2.15)
|
||||
TEST_SYM_INTERVAL (arm_math_cospif, 0, 0x1p-31, 5000)
|
||||
TEST_SYM_INTERVAL (arm_math_cospif, 0x1p-31, 0.5, 10000)
|
||||
TEST_SYM_INTERVAL (arm_math_cospif, 0.5, 0x1p22f, 10000)
|
||||
TEST_SYM_INTERVAL (arm_math_cospif, 0x1p22f, inf, 10000)
|
||||
#endif
|
||||
|
|
@ -5,7 +5,6 @@ glibc-specific conventions need not be followed.
|
|||
The requirements for portable code apply to non-portable code with the
|
||||
following differences:
|
||||
|
||||
|
||||
1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There
|
||||
are no specific restrictions on acceptable ULP error, but if functions
|
||||
provide significantly less accuracy than portable equivalents then a clear
|
||||
|
|
@ -15,9 +14,3 @@ following differences:
|
|||
|
||||
2. Functions are assumed to support round-to-nearest mode by default, unless
|
||||
stated; other rounding modes are not required to be provided.
|
||||
|
||||
3. Handling of special cases may be relaxed for vector functions. Checking
|
||||
whether each vector lane contains special values such as NaN, Inf or
|
||||
denormal numbers can prove too costly for vector functions. This is often
|
||||
not required since vector functions are typically used along with aggressive
|
||||
compiler optimization flags.
|
||||
|
|
@ -1,23 +1,23 @@
|
|||
/*
|
||||
* Double-precision acos(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
#include "poly_scalar_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define AbsMask (0x7fffffffffffffff)
|
||||
#define Half (0x3fe0000000000000)
|
||||
#define One (0x3ff0000000000000)
|
||||
#define PiOver2 (0x1.921fb54442d18p+0)
|
||||
#define Pi (0x1.921fb54442d18p+1)
|
||||
#define Small (0x3c90000000000000) /* 2^-53. */
|
||||
#define Small16 (0x3c90)
|
||||
#define QNaN (0x7ff8)
|
||||
#define AbsMask 0x7fffffffffffffff
|
||||
#define Half 0x3fe0000000000000
|
||||
#define One 0x3ff0000000000000
|
||||
#define PiOver2 0x1.921fb54442d18p+0
|
||||
#define Pi 0x1.921fb54442d18p+1
|
||||
#define Small 0x3c90000000000000 /* 2^-53. */
|
||||
#define Small16 0x3c90
|
||||
#define QNaN 0x7ff8
|
||||
|
||||
/* Fast implementation of double-precision acos(x) based on polynomial
|
||||
approximation of double-precision asin(x).
|
||||
|
|
@ -29,8 +29,8 @@
|
|||
|
||||
acos(x) = pi/2 - asin(x)
|
||||
|
||||
and use an order 11 polynomial P such that the final approximation of asin is
|
||||
an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
|
||||
and use an order 11 polynomial P such that the final approximation of asin
|
||||
is an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
|
||||
|
||||
The largest observed error in this region is 1.18 ulps,
|
||||
acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
|
||||
|
|
@ -90,11 +90,11 @@ acos (double x)
|
|||
return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, acos, -1.0, 1.0)
|
||||
PL_TEST_ULP (acos, 1.02)
|
||||
PL_TEST_INTERVAL (acos, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (acos, -0, -inf, 20000)
|
||||
TEST_SIG (S, D, 1, acos, -1.0, 1.0)
|
||||
TEST_ULP (acos, 1.02)
|
||||
TEST_INTERVAL (acos, 0, Small, 5000)
|
||||
TEST_INTERVAL (acos, Small, 0.5, 50000)
|
||||
TEST_INTERVAL (acos, 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (acos, 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (acos, -0, -inf, 20000)
|
||||
|
|
@ -1,23 +1,23 @@
|
|||
/*
|
||||
* Single-precision acos(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "poly_scalar_f32.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define AbsMask (0x7fffffff)
|
||||
#define Half (0x3f000000)
|
||||
#define One (0x3f800000)
|
||||
#define PiOver2f (0x1.921fb6p+0f)
|
||||
#define Pif (0x1.921fb6p+1f)
|
||||
#define Small (0x32800000) /* 2^-26. */
|
||||
#define Small12 (0x328)
|
||||
#define QNaN (0x7fc)
|
||||
#define AbsMask 0x7fffffff
|
||||
#define Half 0x3f000000
|
||||
#define One 0x3f800000
|
||||
#define PiOver2f 0x1.921fb6p+0f
|
||||
#define Pif 0x1.921fb6p+1f
|
||||
#define Small 0x32800000 /* 2^-26. */
|
||||
#define Small12 0x328
|
||||
#define QNaN 0x7fc
|
||||
|
||||
/* Fast implementation of single-precision acos(x) based on polynomial
|
||||
approximation of single-precision asin(x).
|
||||
|
|
@ -89,11 +89,11 @@ acosf (float x)
|
|||
return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, acos, -1.0, 1.0)
|
||||
PL_TEST_ULP (acosf, 0.82)
|
||||
PL_TEST_INTERVAL (acosf, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
|
||||
TEST_SIG (S, F, 1, acos, -1.0, 1.0)
|
||||
TEST_ULP (acosf, 0.82)
|
||||
TEST_INTERVAL (acosf, 0, Small, 5000)
|
||||
TEST_INTERVAL (acosf, Small, 0.5, 50000)
|
||||
TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (acosf, -0, -inf, 20000)
|
||||
|
|
@ -1,31 +1,26 @@
|
|||
/*
|
||||
* Double-precision acosh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define Ln2 (0x1.62e42fefa39efp-1)
|
||||
#define MinusZero (0x8000000000000000)
|
||||
#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */
|
||||
#define Two (0x4000000000000000) /* asuint64(2.0). */
|
||||
|
||||
double
|
||||
optr_aor_log_f64 (double);
|
||||
|
||||
double
|
||||
log1p (double);
|
||||
|
||||
/* acosh approximation using a variety of approaches on different intervals:
|
||||
|
||||
acosh(x) = ln(x + sqrt(x * x - 1)).
|
||||
|
||||
x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
|
||||
close enough to x that we can calculate the result by ln(2x) == ln(x) +
|
||||
x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1)
|
||||
is close enough to x that we can calculate the result by ln(2x) == ln(x) +
|
||||
ln(2). The greatest observed error in this region is 0.98 ULP:
|
||||
acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9
|
||||
want 0x1.28066a11a7c8p+9.
|
||||
|
|
@ -48,19 +43,19 @@ acosh (double x)
|
|||
return __math_invalid (x);
|
||||
|
||||
if (unlikely (ix >= SquareLim))
|
||||
return optr_aor_log_f64 (x) + Ln2;
|
||||
return log (x) + Ln2;
|
||||
|
||||
if (ix >= Two)
|
||||
return optr_aor_log_f64 (x + sqrt (x * x - 1));
|
||||
return log (x + sqrt (x * x - 1));
|
||||
|
||||
double xm1 = x - 1;
|
||||
return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1));
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, acosh, 1.0, 10.0)
|
||||
PL_TEST_ULP (acosh, 2.19)
|
||||
PL_TEST_INTERVAL (acosh, 0, 1, 10000)
|
||||
PL_TEST_INTERVAL (acosh, 1, 2, 100000)
|
||||
PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
|
||||
PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
|
||||
PL_TEST_INTERVAL (acosh, -0, -inf, 10000)
|
||||
TEST_SIG (S, D, 1, acosh, 1.0, 10.0)
|
||||
TEST_ULP (acosh, 2.19)
|
||||
TEST_INTERVAL (acosh, 0, 1, 10000)
|
||||
TEST_INTERVAL (acosh, 1, 2, 100000)
|
||||
TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
|
||||
TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
|
||||
TEST_INTERVAL (acosh, -0, -inf, 10000)
|
||||
|
|
@ -1,27 +1,19 @@
|
|||
/*
|
||||
* Single-precision acosh(x) function.
|
||||
*
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* Copyright (c) 2022-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define Ln2 (0x1.62e4p-1f)
|
||||
#define MinusZero 0x80000000
|
||||
#define SquareLim 0x5f800000 /* asuint(0x1p64). */
|
||||
#define Two 0x40000000
|
||||
|
||||
/* Single-precision log from math/. */
|
||||
float
|
||||
optr_aor_log_f32 (float);
|
||||
|
||||
/* Single-precision log(1+x) from pl/math. */
|
||||
float
|
||||
log1pf (float);
|
||||
|
||||
/* acoshf approximation using a variety of approaches on different intervals:
|
||||
|
||||
x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
|
||||
|
|
@ -45,19 +37,19 @@ acoshf (float x)
|
|||
return __math_invalidf (x);
|
||||
|
||||
if (unlikely (ix >= SquareLim))
|
||||
return optr_aor_log_f32 (x) + Ln2;
|
||||
return logf (x) + Ln2;
|
||||
|
||||
if (ix > Two)
|
||||
return optr_aor_log_f32 (x + sqrtf (x * x - 1));
|
||||
return logf (x + sqrtf (x * x - 1));
|
||||
|
||||
float xm1 = x - 1;
|
||||
return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1));
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, acosh, 1.0, 10.0)
|
||||
PL_TEST_ULP (acoshf, 2.30)
|
||||
PL_TEST_INTERVAL (acoshf, 0, 1, 100)
|
||||
PL_TEST_INTERVAL (acoshf, 1, 2, 10000)
|
||||
PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
|
||||
PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
|
||||
PL_TEST_INTERVAL (acoshf, -0, -inf, 10000)
|
||||
TEST_SIG (S, F, 1, acosh, 1.0, 10.0)
|
||||
TEST_ULP (acoshf, 2.30)
|
||||
TEST_INTERVAL (acoshf, 0, 1, 100)
|
||||
TEST_INTERVAL (acoshf, 1, 2, 10000)
|
||||
TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
|
||||
TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
|
||||
TEST_INTERVAL (acoshf, -0, -inf, 10000)
|
||||
|
|
@ -1,15 +1,15 @@
|
|||
/*
|
||||
* Double-precision inverse error function (AdvSIMD variant).
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "v_math.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_defs.h"
|
||||
#include "mathlib.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "poly_advsimd_f64.h"
|
||||
#include "test_sig.h"
|
||||
#include "v_poly_f64.h"
|
||||
#define V_LOG_INLINE_POLY_ORDER 4
|
||||
#include "v_log_inline.h"
|
||||
|
||||
|
|
@ -22,7 +22,7 @@ const static struct data
|
|||
can be taken. */
|
||||
double P[8][2], Q[7][2];
|
||||
float64x2_t tailshift;
|
||||
uint8x16_t idx;
|
||||
uint8_t idx[16];
|
||||
struct v_log_inline_data log_tbl;
|
||||
float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
|
||||
} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
|
||||
|
|
@ -58,7 +58,7 @@ const static struct data
|
|||
V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
|
||||
V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
|
||||
.tailshift = V2 (-0.87890625),
|
||||
.idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
.idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
|
||||
.log_tbl = V_LOG_CONSTANTS };
|
||||
|
||||
static inline float64x2_t
|
||||
|
|
@ -128,7 +128,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
|
|||
uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));
|
||||
|
||||
uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
|
||||
uint8x16_t idx = vaddq_u8 (d->idx, off);
|
||||
uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off);
|
||||
|
||||
float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
|
||||
t = vfmaq_f64 (t, x, x);
|
||||
|
|
@ -150,12 +150,17 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
|
|||
return vdivq_f64 (p, q);
|
||||
}
|
||||
|
||||
PL_SIG (V, D, 1, erfinv, -0.99, 0.99)
|
||||
PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8)
|
||||
#if USE_MPFR
|
||||
# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference
|
||||
#else
|
||||
TEST_SIG (V, D, 1, erfinv, -0.99, 0.99)
|
||||
TEST_ULP (V_NAME_D1 (erfinv), 24.8)
|
||||
TEST_DISABLE_FENV (V_NAME_D1 (erfinv))
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
|
||||
TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
|
||||
/* Test with control lane in each interval. */
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
|
||||
0.5)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
|
||||
0.8)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
|
||||
0.95)
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5)
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8)
|
||||
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95)
|
||||
#endif
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
* Single-precision inverse error function (AdvSIMD variant).
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "v_math.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_advsimd_f32.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
#include "v_poly_f32.h"
|
||||
#include "v_logf_inline.h"
|
||||
|
||||
const static struct data
|
||||
|
|
@ -24,14 +24,15 @@ const static struct data
|
|||
|
||||
P_10 and Q_10 are also stored in homogenous vectors to allow better
|
||||
memory access when no lanes are in a tail region. */
|
||||
float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
|
||||
float Plo[4], PQ[4], Qhi[4];
|
||||
float32x4_t P29_3, tailshift;
|
||||
float32x4_t P_50[6], Q_50[2];
|
||||
float32x4_t P_10[3], Q_10[3];
|
||||
uint8x16_t idxhi, idxlo;
|
||||
uint8_t idxhi[16], idxlo[16];
|
||||
struct v_logf_data logf_tbl;
|
||||
} data = {
|
||||
.idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
.idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
|
||||
.idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 },
|
||||
.idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 },
|
||||
.P29_3 = V4 (0x1.b13626p-2),
|
||||
.tailshift = V4 (-0.87890625),
|
||||
.Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
|
||||
|
|
@ -86,7 +87,7 @@ lookup (float32x4_t tbl, uint8x16_t idx)
|
|||
tail region:
|
||||
_ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
|
||||
want 0x1.b4793ap+0 . */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
|
|
@ -124,18 +125,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
|
|||
Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
|
||||
two pairs of coeffs, so we need two idx vectors - one for each pair. */
|
||||
uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
|
||||
uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off);
|
||||
uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off);
|
||||
uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off);
|
||||
uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off);
|
||||
|
||||
/* Load the tables. */
|
||||
float32x4_t p_lo = d->Plo;
|
||||
float32x4_t pq = d->PQ;
|
||||
float32x4_t qhi = d->Qhi;
|
||||
float32x4_t plo = vld1q_f32 (d->Plo);
|
||||
float32x4_t pq = vld1q_f32 (d->PQ);
|
||||
float32x4_t qhi = vld1q_f32 (d->Qhi);
|
||||
|
||||
/* Do the lookup (and calculate p3 by masking non-tail lanes). */
|
||||
float32x4_t p3 = vreinterpretq_f32_u32 (
|
||||
vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
|
||||
float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi),
|
||||
float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi),
|
||||
p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
|
||||
q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);
|
||||
|
||||
|
|
@ -155,9 +156,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
|
|||
return vdivq_f32 (p, q);
|
||||
}
|
||||
|
||||
PL_SIG (V, F, 1, erfinv, -0.99, 0.99)
|
||||
PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49)
|
||||
HALF_WIDTH_ALIAS_F1 (erfinv)
|
||||
|
||||
#if USE_MPFR
|
||||
# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference
|
||||
#else
|
||||
TEST_SIG (V, F, 1, erfinv, -0.99, 0.99)
|
||||
TEST_DISABLE_FENV (V_NAME_F1 (erfinv))
|
||||
TEST_ULP (V_NAME_F1 (erfinv), 4.49)
|
||||
TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000)
|
||||
/* Test with control lane in each interval. */
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8)
|
||||
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95)
|
||||
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5)
|
||||
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8)
|
||||
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95)
|
||||
#endif
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Single-precision vector log function - inline version
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* Copyright (c) 2019-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
|
|
@ -1,22 +1,22 @@
|
|||
/*
|
||||
* Double-precision asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "poly_scalar_f64.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "test_sig.h"
|
||||
#include "test_defs.h"
|
||||
|
||||
#define AbsMask (0x7fffffffffffffff)
|
||||
#define Half (0x3fe0000000000000)
|
||||
#define One (0x3ff0000000000000)
|
||||
#define PiOver2 (0x1.921fb54442d18p+0)
|
||||
#define Small (0x3e50000000000000) /* 2^-26. */
|
||||
#define Small16 (0x3e50)
|
||||
#define QNaN (0x7ff8)
|
||||
#define AbsMask 0x7fffffffffffffff
|
||||
#define Half 0x3fe0000000000000
|
||||
#define One 0x3ff0000000000000
|
||||
#define PiOver2 0x1.921fb54442d18p+0
|
||||
#define Small 0x3e50000000000000 /* 2^-26. */
|
||||
#define Small16 0x3e50
|
||||
#define QNaN 0x7ff8
|
||||
|
||||
/* Fast implementation of double-precision asin(x) based on polynomial
|
||||
approximation.
|
||||
|
|
@ -54,8 +54,8 @@
|
|||
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
|
||||
|
||||
The largest observed error in this region is 2.69 ulps,
|
||||
asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
|
||||
want 0x1.110d7e85fdd53p-1. */
|
||||
asin(0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
|
||||
want 0x1.1111dd54ddf99p-1. */
|
||||
double
|
||||
asin (double x)
|
||||
{
|
||||
|
|
@ -96,11 +96,11 @@ asin (double x)
|
|||
return asdouble (asuint64 (y) | sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, asin, -1.0, 1.0)
|
||||
PL_TEST_ULP (asin, 2.19)
|
||||
PL_TEST_INTERVAL (asin, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (asin, -0, -inf, 20000)
|
||||
TEST_SIG (S, D, 1, asin, -1.0, 1.0)
|
||||
TEST_ULP (asin, 2.20)
|
||||
TEST_INTERVAL (asin, 0, Small, 5000)
|
||||
TEST_INTERVAL (asin, Small, 0.5, 50000)
|
||||
TEST_INTERVAL (asin, 0.5, 1.0, 50000)
|
||||
TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
|
||||
TEST_INTERVAL (asin, 0x1p11, inf, 20000)
|
||||
TEST_INTERVAL (asin, -0, -inf, 20000)
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Coefficients for single-precision asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* Copyright (c) 2023-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue