Update the Arm Optimized Routines library to v25.01

Sponsored by:	Arm Ltd
Andrew Turner 2025-01-10 11:14:39 +00:00
commit f3087bef11
472 changed files with 11930 additions and 14603 deletions

View file

@@ -1,12 +1,9 @@
/
Szabolcs Nagy <szabolcs.nagy@arm.com>
Tamar Christina <tamar.christina@arm.com>
math/
Szabolcs Nagy <szabolcs.nagy@arm.com>
networking/
Szabolcs Nagy <szabolcs.nagy@arm.com>
pl/
Pierre Blanchard <pierre.blanchard@arm.com>
Joe Ramsay <joe.ramsay@arm.com>
networking/
Ola Liljedahl <ola.liljedahl@arm.com>
string/
Szabolcs Nagy <szabolcs.nagy@arm.com>
Wilco Dijkstra <wilco.dijkstra@arm.com>

View file

@@ -1,6 +1,6 @@
# Makefile - requires GNU make
#
# Copyright (c) 2018-2022, Arm Limited.
# Copyright (c) 2018-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
srcdir = .
@@ -11,7 +11,6 @@ includedir = $(prefix)/include
# Configure these in config.mk, do not make changes in this file.
SUBS = math string networking
PLSUBS = math
HOST_CC = cc
HOST_CFLAGS = -std=c99 -O2
HOST_LDFLAGS =
@@ -21,12 +20,22 @@ CPPFLAGS =
CFLAGS = -std=c99 -O2
CFLAGS_SHARED = -fPIC
CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS)
CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL
LDFLAGS =
LDLIBS =
AR = $(CROSS_COMPILE)ar
RANLIB = $(CROSS_COMPILE)ranlib
INSTALL = install
# Detect OS.
# Assume Unix environment: Linux, Darwin, or Msys.
OS := $(shell uname -s)
OS := $(patsubst MSYS%,Msys,$(OS))
# Following math dependencies can be adjusted in config file
# if necessary, e.g. for Msys.
libm-libs = -lm
libc-libs = -lc
mpfr-libs = -lmpfr
gmp-libs = -lgmp
mpc-libs = -lmpc
all:
@@ -53,7 +62,6 @@ $(DIRS):
mkdir -p $@
$(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED)
$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED)
build/%.o: $(srcdir)/%.S
$(CC) $(CFLAGS_ALL) -c -o $@ $<

View file

@@ -12,12 +12,25 @@ contribution requirements are documented in README.contributors of
the appropriate subdirectory.
Regular quarterly releases are tagged as vYY.MM, the latest
release is v24.01.
release is v25.01.
Source code layout:
build/ - build directory (created by make).
math/ - math subproject sources.
math/ - math subproject sources for generic scalar
subroutines and sources shared with
subdirectories of math/.
All math routines should meet the quality
requirements stated in math/README.contributors;
routines that fail to do so are located in an
experimental/ directory.
math/aarch64/ - math subproject AArch64-specific sources
and sources shared with subdirectories.
math/aarch64/advsimd - AdvSIMD-specific math sources.
math/aarch64/experimental - Experimental math sources that do not
meet the quality requirements stated in
math/README.contributors.
math/aarch64/sve - SVE-specific math sources.
math/include/ - math library public headers.
math/test/ - math test and benchmark related sources.
math/tools/ - tools used for designing the algorithms.
@@ -25,9 +38,16 @@ networking/ - networking subproject sources.
networking/include/ - networking library public headers.
networking/test/ - networking test and benchmark related sources.
string/ - string routines subproject sources.
All string routines should meet the quality
requirements stated in string/README.contributors;
routines that fail to do so are located in an
experimental/ directory.
string/<arch> - <arch>-specific string routine sources for
<arch> = aarch64 and arm.
string/aarch64/experimental - Experimental string routines which
may not be fully optimized yet.
string/include/ - string library public headers.
string/test/ - string test and benchmark related sources.
pl/... - separately maintained performance library code.
The steps to build the target libraries and run the tests:
@@ -50,6 +70,13 @@ Or building and testing the math subproject only:
make all-math
make check-math
Note on compiler compatibility/requirements:
SVE routines are always built by default - this means that on AArch64
GCC >= 10 or LLVM >= 5 is always required for SVE ACLE compatibility.
There is no explicit check for a compatible compiler, therefore the SVE
routines will fail to build if CC is too old.
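A minimal compile-time guard (editor's sketch - the library itself has no
such check) would make the failure mode explicit; __ARM_FEATURE_SVE is the
ACLE feature macro defined by SVE-capable compilers:

  /* Hypothetical guard, assuming an SVE target (-march=armv8-a+sve). */
  #ifndef __ARM_FEATURE_SVE
  # error "SVE ACLE unavailable: need GCC >= 10 or LLVM >= 5 and an SVE target"
  #endif
  #include <arm_sve.h>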
The test system requires libmpfr and libmpc.
For example, on Debian Linux they can be installed as:

View file

@@ -1,14 +1,11 @@
# Example config.mk
#
# Copyright (c) 2018-2023, Arm Limited.
# Copyright (c) 2018-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
# Subprojects to build
SUBS = math string networking
# Subsubprojects to build if subproject pl is built
PLSUBS = math
# Target architecture: aarch64, arm or x86_64
ARCH = aarch64
@@ -30,6 +27,27 @@ HOST_CFLAGS += -Wall -Wno-unused-function
HOST_CFLAGS += -g
CFLAGS += -g
ifeq ($(OS),Msys)
# llvm is the only available/valid native compiler
CC = clang
AR = llvm-ar
RANLIB = llvm-ranlib
HOST_CC = clang
SYSROOT = /c/wenv/msys2/msys64/clangarm64
# Common windows flags
COMMON_WIN_CFLAGS = -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE
COMMON_WIN_CFLAGS += -Wno-deprecated-declarations -Wno-unused-variable
# For mathtest
HOST_CFLAGS += -I$(SYSROOT)/include
HOST_CFLAGS += $(COMMON_WIN_CFLAGS) -Wno-ignored-attributes
# Clear the default flag -fPIC, as not supported on Windows
CFLAGS_SHARED =
# For ulp.h with MPFR
CFLAGS += -I$(SYSROOT)/include
# For clang on Windows
CFLAGS += $(COMMON_WIN_CFLAGS)
endif
# Optimize the shared libraries on aarch64 assuming they fit in 1M.
#CFLAGS_SHARED = -fPIC -mcmodel=tiny
@@ -45,12 +63,33 @@ math-cflags =
math-ldlibs =
math-ulpflags =
math-testflags =
string-cflags =
string-cflags = -falign-functions=64
networking-cflags =
# Use if mpfr is available on the target for ulp error checking.
#math-ldlibs += -lmpfr -lgmp
#math-cflags += -DUSE_MPFR
ifeq ($(OS),Msys)
# Libraries can be installed with pacman
libm-libs = -lmsvcrt -lvcruntime -lucrt
libc-libs =
# Linker will look for .lib but some systems only have .dll.a,
# therefore we have to give absolute paths to the libraries.
# This is system dependent and might need adjusting.
mpfr-libs = $(SYSROOT)/lib/libmpfr.dll.a
gmp-libs = $(SYSROOT)/lib/libgmp.dll.a
mpc-libs = $(SYSROOT)/lib/libmpc.dll.a
endif
# Use if mpfr is available on the target for ulp error checking. If
# enabling this, it is advised to disable fenv checks by uncommenting
# the two lines at the bottom of this block.
USE_MPFR=0
math-cflags += -DUSE_MPFR=$(USE_MPFR)
ifeq ($(USE_MPFR), 1)
math-ldlibs += $(mpfr-libs) $(gmp-libs)
math-ulpflags += -m
endif
# Disable fenv checks
#math-ulpflags = -q -f
#math-testflags = -nostatus
# Use with gcc.
math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector
@@ -59,30 +98,36 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Use with clang.
#math-cflags += -ffp-contract=fast
# Disable/enable SVE vector math code and tests.
# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
# routines only so that SVE code does not leak into scalar
# routines. It is also necessary to add it for tools (e.g. ulp,
# mathbench)
WANT_SVE_MATH = 0
ifeq ($(WANT_SVE_MATH), 1)
math-sve-cflags = -march=armv8-a+sve
endif
math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
# If defined to 1, set errno in math functions according to ISO C. Many math
# libraries do not set errno, so this is 0 by default. It may need to be
# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
WANT_ERRNO = 0
math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)
# Disable/enable SVE vector math tests/tools.
ifeq ($(ARCH),aarch64)
WANT_SVE_TESTS = 1
else
WANT_SVE_TESTS = 0
endif
math-cflags += -DWANT_SVE_TESTS=$(WANT_SVE_TESTS)
# If set to 1, set fenv in vector math routines.
WANT_SIMD_EXCEPT = 0
math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
# Disable fenv checks
#math-ulpflags = -q -f
#math-testflags = -nostatus
# If set to 1, enable tests for exp10.
WANT_EXP10_TESTS = 1
math-cflags += -DWANT_EXP10_TESTS=$(WANT_EXP10_TESTS)
# If set to 1, enable tests for sinpi and cospi. These functions are
# only supported on aarch64.
ifeq ($(ARCH),aarch64)
WANT_TRIGPI_TESTS = 1
else
WANT_TRIGPI_TESTS = 0
endif
math-cflags += -DWANT_TRIGPI_TESTS=$(WANT_TRIGPI_TESTS)
# Remove GNU Property Notes from asm files.
#string-cflags += -DWANT_GNU_PROPERTY=0
@@ -92,3 +137,13 @@ math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
# Avoid auto-vectorization of scalar code and unroll loops
networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
# Provide *_finite symbols and some of the glibc hidden symbols
# so libmathlib can be used with binaries compiled against glibc
# to interpose math functions with both static and dynamic linking
USE_GLIBC_ABI = 1
math-cflags += -DUSE_GLIBC_ABI=$(USE_GLIBC_ABI)
# Enable experimental math routines - non-C23 vector math and low-accuracy scalar
WANT_EXPERIMENTAL_MATH = 0
math-cflags += -DWANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH)

View file

@@ -1,23 +1,61 @@
# Makefile fragment - requires GNU make
#
# Copyright (c) 2019-2023, Arm Limited.
# Copyright (c) 2019-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/math
B := build/math
.SECONDEXPANSION:
math-lib-srcs := $(wildcard $(S)/*.[cS])
math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
ifneq ($(OS),Linux)
ifeq ($(WANT_SIMD_EXCEPT),1)
$(error WANT_SIMD_EXCEPT is not supported outside Linux)
endif
ifneq ($(USE_MPFR),1)
$(warning WARNING: Double-precision ULP tests will not be usable without MPFR)
endif
ifeq ($(USE_GLIBC_ABI),1)
$(error Can only generate special GLIBC symbols on Linux - please disable USE_GLIBC_ABI)
endif
endif
ifneq ($(ARCH),aarch64)
ifeq ($(WANT_TRIGPI_TESTS),1)
$(error trigpi functions only supported on aarch64)
endif
ifeq ($(WANT_EXPERIMENTAL_MATH),1)
$(error Experimental math only supported on aarch64)
endif
endif
math-src-dir := $(srcdir)/math
math-build-dir := build/math
math-lib-srcs := $(wildcard $(math-src-dir)/*.[cS])
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*.[cS])
ifeq ($(OS),Linux)
# Vector symbols only supported on Linux
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*/*.[cS])
endif
ifeq ($(WANT_EXPERIMENTAL_MATH), 1)
ifeq ($(OS),Linux)
# Vector symbols only supported on Linux
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*/*.[cS])
else
math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*.[cS])
endif
else
# Scalar experimental symbols will have been added by wildcard, so remove them
math-lib-srcs := $(filter-out $(math-src-dir)/aarch64/experimental/%, $(math-lib-srcs))
endif
math-test-srcs := \
$(S)/test/mathtest.c \
$(S)/test/mathbench.c \
$(S)/test/ulp.c \
$(math-src-dir)/test/mathtest.c \
$(math-src-dir)/test/mathbench.c \
$(math-src-dir)/test/ulp.c \
math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
math-test-host-srcs := $(wildcard $(math-src-dir)/test/rtest/*.[cS])
math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
math-includes := $(patsubst $(math-src-dir)/%,build/%,$(wildcard $(math-src-dir)/include/*.h))
math-libs := \
build/lib/libmathlib.so \
@@ -33,9 +71,9 @@ math-tools := \
math-host-tools := \
build/bin/rtest \
math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs)))
math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
math-lib-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-lib-srcs)))
math-test-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-srcs)))
math-host-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-host-srcs)))
math-target-objs := $(math-lib-objs) $(math-test-objs)
math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
@@ -44,18 +82,69 @@ math-files := \
$(math-libs) \
$(math-tools) \
$(math-host-tools) \
$(math-includes) \
$(math-test-includes) \
$(math-includes)
all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
all-math: $(math-libs) $(math-tools) $(math-includes)
$(math-objs): $(math-includes) $(math-test-includes)
$(math-objs): $(math-includes)
$(math-objs): CFLAGS_ALL += $(math-cflags)
$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
$(math-build-dir)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
$(math-host-objs): CC = $(HOST_CC)
$(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS)
$(B)/test/ulp.o: $(S)/test/ulp.h
# Add include path for experimental routines so they can share helpers with non-experimental routines
$(math-build-dir)/aarch64/experimental/advsimd/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/advsimd
$(math-build-dir)/aarch64/experimental/sve/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/sve
$(math-objs): CFLAGS_ALL += -I$(math-src-dir)
ulp-funcs-dir = build/test/ulp-funcs/
ulp-wrappers-dir = build/test/ulp-wrappers/
mathbench-funcs-dir = build/test/mathbench-funcs/
test-sig-dirs = $(ulp-funcs-dir) $(ulp-wrappers-dir) $(mathbench-funcs-dir)
build/include/test $(test-sig-dirs) $(addsuffix /$(ARCH),$(test-sig-dirs)) $(addsuffix /aarch64/experimental,$(test-sig-dirs)) \
$(addsuffix /aarch64/experimental/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/experimental/sve,$(test-sig-dirs)) \
$(addsuffix /aarch64/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/sve,$(test-sig-dirs)):
mkdir -p $@
ulp-funcs = $(patsubst $(math-src-dir)/%,$(ulp-funcs-dir)/%,$(basename $(math-lib-srcs)))
ulp-wrappers = $(patsubst $(math-src-dir)/%,$(ulp-wrappers-dir)/%,$(basename $(math-lib-srcs)))
mathbench-funcs = $(patsubst $(math-src-dir)/%,$(mathbench-funcs-dir)/%,$(basename $(math-lib-srcs)))
ifeq ($(WANT_SVE_TESTS), 0)
# Filter out anything with sve in the path
ulp-funcs := $(foreach a,$(ulp-funcs),$(if $(findstring sve,$a),,$a))
ulp-wrappers := $(foreach a,$(ulp-wrappers),$(if $(findstring sve,$a),,$a))
mathbench-funcs := $(foreach a,$(mathbench-funcs),$(if $(findstring sve,$a),,$a))
endif
define emit_sig
$1/aarch64/experimental/sve/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/sve
$1/aarch64/experimental/advsimd/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/advsimd
$1/%.i: $(math-src-dir)/%.c | $$$$(@D)
$(CC) $$< $(math-cflags) -I$(math-src-dir)/include -I$(math-src-dir) $$(EXTRA_INC) -D$2 -E -o $$@
$1/%: $1/%.i
{ grep TEST_SIG $$< || true; } | cut -f 2- -d ' ' > $$@
endef
$(eval $(call emit_sig,$(ulp-funcs-dir),EMIT_ULP_FUNCS))
$(eval $(call emit_sig,$(ulp-wrappers-dir),EMIT_ULP_WRAPPERS))
$(eval $(call emit_sig,$(mathbench-funcs-dir),EMIT_MATHBENCH_FUNCS))
ulp-funcs-gen = build/include/test/ulp_funcs_gen.h
ulp-wrappers-gen = build/include/test/ulp_wrappers_gen.h
mathbench-funcs-gen = build/include/test/mathbench_funcs_gen.h
math-tools-autogen-headers = $(ulp-funcs-gen) $(ulp-wrappers-gen) $(mathbench-funcs-gen)
$(ulp-funcs-gen): $(ulp-funcs) | $$(@D)
$(ulp-wrappers-gen): $(ulp-wrappers) | $$(@D)
$(mathbench-funcs-gen): $(mathbench-funcs) | $$(@D)
$(math-tools-autogen-headers): | $$(@D)
cat $^ | sort -u > $@
$(math-build-dir)/test/mathbench.o: $(mathbench-funcs-gen)
$(math-build-dir)/test/ulp.o: $(math-src-dir)/test/ulp.h $(ulp-funcs-gen) $(ulp-wrappers-gen)
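# Editor's note (an assumption inferred from the rules above, not documented
# here): the extraction is purely textual.  Each source is preprocessed with
# exactly one EMIT_* macro defined, any line containing TEST_SIG survives into
# the .i file, and `cut -f 2- -d ' '` drops the leading marker word, leaving
# the signature payload that gets sorted into the generated ulp/mathbench
# headers.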
build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
@@ -65,38 +154,40 @@ build/lib/libmathlib.a: $(math-lib-objs)
$(AR) rc $@ $^
$(RANLIB) $@
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
$(math-host-tools): HOST_LDLIBS += $(libm-libs) $(mpfr-libs) $(mpc-libs)
$(math-tools): LDLIBS += $(math-ldlibs) $(libm-libs)
ifneq ($(OS),Darwin)
$(math-tools): LDFLAGS += -static
endif
build/bin/rtest: $(math-host-objs)
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
build/bin/mathtest: $(math-build-dir)/test/mathtest.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)
build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
build/bin/mathbench: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)
# This is not ideal, but allows custom symbols in mathbench to get resolved.
build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm
build/bin/mathbench_libc: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $< $(libm-libs) $(libc-libs) build/lib/libmathlib.a $(libm-libs)
build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
build/bin/ulp: $(math-build-dir)/test/ulp.o build/lib/libmathlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(LDLIBS)
build/include/%.h: $(S)/include/%.h
build/include/%.h: $(math-src-dir)/include/%.h
cp $< $@
build/include/test/%.h: $(S)/test/%.h
build/bin/%.sh: $(math-src-dir)/test/%.sh
cp $< $@
build/bin/%.sh: $(S)/test/%.sh
cp $< $@
math-tests := $(wildcard $(S)/test/testcases/directed/*.tst)
math-rtests := $(wildcard $(S)/test/testcases/random/*.tst)
math-tests := $(wildcard $(math-src-dir)/test/testcases/directed/*.tst)
ifneq ($(WANT_EXP10_TESTS),1)
math-tests := $(filter-out %exp10.tst, $(math-tests))
endif
math-rtests := $(wildcard $(math-src-dir)/test/testcases/random/*.tst)
check-math-test: $(math-tools)
cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags)
@@ -104,8 +195,88 @@ check-math-test: $(math-tools)
check-math-rtest: $(math-host-tools) $(math-tools)
cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
ulp-input-dir = $(math-build-dir)/test/inputs
$(ulp-input-dir) $(ulp-input-dir)/$(ARCH) $(ulp-input-dir)/aarch64/sve $(ulp-input-dir)/aarch64/advsimd \
$(ulp-input-dir)/aarch64/experimental $(ulp-input-dir)/aarch64/experimental/advsimd $(ulp-input-dir)/aarch64/experimental/sve:
mkdir -p $@
math-lib-lims = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp,$(math-lib-srcs))
math-lib-lims-nn = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp_nn,$(math-lib-srcs))
math-lib-fenvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.fenv,$(math-lib-srcs))
math-lib-itvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.itv,$(math-lib-srcs))
math-lib-cvals = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.cval,$(math-lib-srcs))
ulp-inputs = $(math-lib-lims) $(math-lib-lims-nn) $(math-lib-fenvs) $(math-lib-itvs) $(math-lib-cvals)
$(ulp-inputs): CFLAGS = -I$(math-src-dir)/test -I$(math-src-dir)/include -I$(math-src-dir) $(math-cflags)\
-I$(math-src-dir)/aarch64/advsimd -I$(math-src-dir)/aarch64/sve
$(ulp-input-dir)/%.ulp.i: $(math-src-dir)/%.c | $$(@D)
$(CC) $(CFLAGS) $< -E -o $@
$(ulp-input-dir)/%.ulp: $(ulp-input-dir)/%.ulp.i
{ grep "TEST_ULP " $< || true; } > $@
$(ulp-input-dir)/%.ulp_nn.i: $(math-src-dir)/%.c | $$(@D)
$(CC) $(CFLAGS) $< -E -o $@
$(ulp-input-dir)/%.ulp_nn: $(ulp-input-dir)/%.ulp_nn.i
{ grep "TEST_ULP_NONNEAREST " $< || true; } > $@
$(ulp-input-dir)/%.fenv.i: $(math-src-dir)/%.c | $$(@D)
$(CC) $(CFLAGS) $< -E -o $@
$(ulp-input-dir)/%.fenv: $(ulp-input-dir)/%.fenv.i
{ grep "TEST_DISABLE_FENV " $< || true; } > $@
$(ulp-input-dir)/%.itv.i: $(math-src-dir)/%.c | $$(@D)
$(CC) $(CFLAGS) $< -E -o $@
$(ulp-input-dir)/%.itv: $(ulp-input-dir)/%.itv.i
{ grep "TEST_INTERVAL " $< || true; } | sed "s/ TEST_INTERVAL/\nTEST_INTERVAL/g" > $@
$(ulp-input-dir)/%.cval.i: $(math-src-dir)/%.c | $$(@D)
$(CC) $(CFLAGS) $< -E -o $@
$(ulp-input-dir)/%.cval: $(ulp-input-dir)/%.cval.i
{ grep "TEST_CONTROL_VALUE " $< || true; } > $@
ulp-lims = $(ulp-input-dir)/limits
$(ulp-lims): $(math-lib-lims)
ulp-lims-nn = $(ulp-input-dir)/limits_nn
$(ulp-lims-nn): $(math-lib-lims-nn)
fenv-exps := $(ulp-input-dir)/fenv
$(fenv-exps): $(math-lib-fenvs)
generic-itvs = $(ulp-input-dir)/itvs
$(generic-itvs): $(filter-out $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
arch-itvs = $(ulp-input-dir)/$(ARCH)/itvs
$(arch-itvs): $(filter $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
ulp-cvals := $(ulp-input-dir)/cvals
$(ulp-cvals): $(math-lib-cvals)
# Remove first word, which will be the TEST directive
$(ulp-lims) $(ulp-lims-nn) $(fenv-exps) $(arch-itvs) $(generic-itvs) $(ulp-cvals): | $$(@D)
sed "s/TEST_[^ ]* //g" $^ | sort -u > $@
check-math-ulp: $(ulp-lims) $(ulp-lims-nn)
check-math-ulp: $(fenv-exps) $(ulp-cvals)
check-math-ulp: $(generic-itvs) $(arch-itvs)
check-math-ulp: $(math-tools)
ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
ULPFLAGS="$(math-ulpflags)" \
LIMITS=../../$(ulp-lims) \
ARCH_ITVS=../../$(arch-itvs) \
GEN_ITVS=../../$(generic-itvs) \
DISABLE_FENV=../../$(fenv-exps) \
CVALS=../../$(ulp-cvals) \
FUNC=$(func) \
WANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH) \
WANT_SVE_TESTS=$(WANT_SVE_TESTS) \
USE_MPFR=$(USE_MPFR) \
build/bin/runulp.sh $(EMULATOR)
check-math: check-math-test check-math-rtest check-math-ulp

View file

@@ -1,8 +1,9 @@
STYLE REQUIREMENTS
==================
1. Most code in this sub-directory is expected to be upstreamed into glibc so
the GNU Coding Standard and glibc specific conventions should be followed
1. With the exception of math/aarch64/experimental/, most code in this
sub-directory is expected to be upstreamed into glibc so the GNU
Coding Standard and glibc specific conventions should be followed
to ease upstreaming.
2. ABI and symbols: the code should be written so it is suitable for inclusion

View file

@@ -1,14 +1,14 @@
/*
* Double-precision vector acos(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f64.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@@ -30,8 +30,8 @@ static const struct data
};
#define AllMask v_u64 (0xffffffffffffffff)
#define Oneu (0x3ff0000000000000)
#define Small (0x3e50000000000000) /* 2^-53. */
#define Oneu 0x3ff0000000000000
#define Small 0x3e50000000000000 /* 2^-53. */
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
@@ -111,12 +111,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
return vfmaq_f64 (add, mul, y);
}
PL_SIG (V, D, 1, acos, -1.0, 1.0)
PL_TEST_ULP (V_NAME_D1 (acos), 1.02)
PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
TEST_SIG (V, D, 1, acos, -1.0, 1.0)
TEST_ULP (V_NAME_D1 (acos), 1.02)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)

View file

@@ -1,14 +1,14 @@
/*
* Single-precision vector acos(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f32.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@@ -57,8 +57,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The largest observed error in this region is 1.32 ulps,
_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
want 0x1.feb32ep-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
want 0x1.feb32ep-1. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -102,12 +102,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
return vfmaq_f32 (add, mul, y);
}
PL_SIG (V, F, 1, acos, -1.0, 1.0)
PL_TEST_ULP (V_NAME_F1 (acos), 0.82)
PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
HALF_WIDTH_ALIAS_F1 (acos)
TEST_SIG (V, F, 1, acos, -1.0, 1.0)
TEST_ULP (V_NAME_F1 (acos), 0.82)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)

View file

@@ -1,12 +1,12 @@
/*
* Single-precision vector acosh(x) function.
* Copyright (c) 2023, Arm Limited.
* Double-precision vector acosh(x) function.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"
@@ -45,9 +45,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
#endif
float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
float64x2_t y;
y = vaddq_f64 (x, v_f64 (1));
float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
y = vmulq_f64 (y, xm1);
y = vsqrtq_f64 (y);
y = vaddq_f64 (xm1, y);
@@ -57,10 +56,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
return log1p_inline (y, &d->log1p_consts);
}
PL_SIG (V, D, 1, acosh, 1.0, 10.0)
PL_TEST_ULP (V_NAME_D1 (acosh), 2.53)
PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
TEST_SIG (V, D, 1, acosh, 1.0, 10.0)
TEST_ULP (V_NAME_D1 (acosh), 2.53)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)

View file

@@ -1,49 +1,46 @@
/*
* Single-precision vector acosh(x) function.
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_log1pf_inline.h"
#define SquareLim 0x1p64
const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
uint16x4_t thresh;
} data = {
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.one = V4 (0x3f800000),
.thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1). */
};
} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
#define SignMask 0x80000000
#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
const struct v_log1pf_data d)
const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
is 2.78 ULP:
__v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
want 0x1.ef9ea2p-3.
is 3.00 ULP:
_ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
which gives maximum error of 3.07 ULP:
__v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
want 0x1.fbc7f4p-4. */
which gives maximum error of 3.22 ULP:
_ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
want 0x1.fdcdd2p-5. */
VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
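  /* Editor's note: vsubhn_u32 keeps only the top 16 bits of (ix - one), so
     this single 16-bit compare flags every special input at once: x < 1
     (including negatives and NaNs, whose bit patterns wrap to large values
     after the subtraction) and x >= 0x1p64. */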
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -54,25 +51,28 @@ VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
float32x4_t u
= vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
if (unlikely (v_any_u16h (special)))
return special_case (x, y, special, d->log1pf_consts);
return log1pf_inline (y, d->log1pf_consts);
return special_case (x, y, special, &d->log1pf_consts);
return log1pf_inline (y, &d->log1pf_consts);
}
PL_SIG (V, F, 1, acosh, 1.0, 10.0)
HALF_WIDTH_ALIAS_F1 (acosh)
TEST_SIG (V, F, 1, acosh, 1.0, 10.0)
#if WANT_SIMD_EXCEPT
PL_TEST_ULP (V_NAME_F1 (acosh), 2.29)
TEST_ULP (V_NAME_F1 (acosh), 2.50)
#else
PL_TEST_ULP (V_NAME_F1 (acosh), 2.58)
TEST_ULP (V_NAME_F1 (acosh), 2.78)
#endif
PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)

View file

@@ -1,36 +1,35 @@
/*
* Double-precision vector asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float64x2_t poly[12];
float64x2_t c0, c2, c4, c6, c8, c10;
float64x2_t pi_over_2;
uint64x2_t abs_mask;
double c1, c3, c5, c7, c9, c11;
} data = {
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
.poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
.abs_mask = V2 (0x7fffffffffffffff),
.c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4,
.c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6,
.c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6,
.c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7,
.c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6,
.c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6,
.pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
};
#define AllMask v_u64 (0xffffffffffffffff)
#define One (0x3ff0000000000000)
#define Small (0x3e50000000000000) /* 2^-12. */
#define One 0x3ff0000000000000
#define Small 0x3e50000000000000 /* 2^-12. */
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
@@ -58,12 +57,11 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
The largest observed error in this region is 2.69 ulps,
_ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
want 0x1.110d7e85fdd53p-1. */
_ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
want 0x1.1111dd54ddf99p-1. */
float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
#if WANT_SIMD_EXCEPT
@@ -76,7 +74,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
return special_case (x, x, AllMask);
#endif
uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
z = x ^ 2 and y = |x| , if |x| < 0.5
@@ -89,7 +87,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
float64x2_t z4 = vmulq_f64 (z2, z2);
float64x2_t z8 = vmulq_f64 (z4, z4);
float64x2_t z16 = vmulq_f64 (z8, z8);
float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
/* order-11 estrin. */
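  /* Editor's note: the odd-index coefficients are stored as adjacent scalar
     doubles so that each vld1q_f64 below loads two of them into one vector,
     and vfmaq_laneq_f64 then selects a single lane - halving the number of
     vector constant registers kept live across the evaluation. */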
float64x2_t c13 = vld1q_f64 (&d->c1);
float64x2_t c57 = vld1q_f64 (&d->c5);
float64x2_t c911 = vld1q_f64 (&d->c9);
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
float64x2_t p = vfmaq_f64 (p07, z16, p811);
/* Finalize polynomial: z + z * z2 * P(z2). */
p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
@@ -102,12 +119,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
return vbslq_f64 (d->abs_mask, y, x);
}
PL_SIG (V, D, 1, asin, -1.0, 1.0)
PL_TEST_ULP (V_NAME_D1 (asin), 2.19)
PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
TEST_SIG (V, D, 1, asin, -1.0, 1.0)
TEST_ULP (V_NAME_D1 (asin), 2.20)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)

View file

@@ -1,14 +1,14 @@
/*
* Single-precision vector asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f32.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The largest observed error in this region is 2.41 ulps,
_ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -93,12 +93,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
return vbslq_f32 (v_u32 (AbsMask), y, x);
}
PL_SIG (V, F, 1, asin, -1.0, 1.0)
PL_TEST_ULP (V_NAME_F1 (asin), 1.91)
PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
HALF_WIDTH_ALIAS_F1 (asin)
TEST_SIG (V, F, 1, asin, -1.0, 1.0)
TEST_ULP (V_NAME_F1 (asin), 1.91)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)

View file

@@ -0,0 +1,242 @@
/*
* Double-precision vector asinh(x) function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "test_defs.h"
#include "test_sig.h"
#include "v_math.h"
const static struct data
{
uint64x2_t huge_bound, abs_mask, off, mask;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound;
#endif
float64x2_t lc0, lc2;
double lc1, lc3, ln2, lc4;
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
double c1, c3, c5, c7, c9, c11, c13, c15;
} data = {
#if WANT_SIMD_EXCEPT
.tiny_bound = V2 (0x1p-26),
#endif
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
.c0 = V2 (-0x1.55555555554a7p-3),
.c1 = 0x1.3333333326c7p-4,
.c2 = V2 (-0x1.6db6db68332e6p-5),
.c3 = 0x1.f1c71b26fb40dp-6,
.c4 = V2 (-0x1.6e8b8b654a621p-6),
.c5 = 0x1.1c4daa9e67871p-6,
.c6 = V2 (-0x1.c9871d10885afp-7),
.c7 = 0x1.7a16e8d9d2ecfp-7,
.c8 = V2 (-0x1.3ddca533e9f54p-7),
.c9 = 0x1.0becef748dafcp-7,
.c10 = V2 (-0x1.b90c7099dd397p-8),
.c11 = 0x1.541f2bb1ffe51p-8,
.c12 = V2 (-0x1.d217026a669ecp-9),
.c13 = 0x1.0b5c7977aaf7p-9,
.c14 = V2 (-0x1.e0f37daef9127p-11),
.c15 = 0x1.388b5fe542a6p-12,
.c16 = V2 (-0x1.021a48685e287p-14),
.c17 = V2 (0x1.93d4ba83d34dap-18),
.lc0 = V2 (-0x1.ffffffffffff7p-2),
.lc1 = 0x1.55555555170d4p-2,
.lc2 = V2 (-0x1.0000000399c27p-2),
.lc3 = 0x1.999b2e90e94cap-3,
.lc4 = -0x1.554e550bd501ep-3,
.ln2 = 0x1.62e42fefa39efp-1,
.off = V2 (0x3fe6900900000000),
.huge_bound = V2 (0x5fe0000000000000),
.abs_mask = V2 (0x7fffffffffffffff),
.mask = V2 (0xfffULL << 52),
};
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
uint64x2_t special)
{
/* Copy sign. */
y = vbslq_f64 (abs_mask, y, x);
return v_call_f64 (asinh, x, y, special);
}
#define N (1 << V_LOG_TABLE_BITS)
#define IndexMask (N - 1)
struct entry
{
float64x2_t invc;
float64x2_t logc;
};
static inline struct entry
lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
e.logc = vuzp2q_f64 (e0, e1);
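  /* vuzp1/vuzp2 de-interleave the two loaded {invc, logc} pairs: invc gathers
     the even elements (e0[0], e1[0]) and logc the odd ones (e0[1], e1[1]). */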
return e;
}
static inline float64x2_t
log_inline (float64x2_t xm, const struct data *d)
{
uint64x2_t u = vreinterpretq_u64_f64 (xm);
uint64x2_t u_off = vsubq_u64 (u, d->off);
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
struct entry e = lookup (u_off);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
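  /* Editor's expansion of the identity: write x = 2^k * z with z near 1, look
     up c ~= z in the table so that r = z/c - 1 is small, then
     log(x) = k*ln2 + log(c) + log1p(r), where log1p(r) is approximated by the
     polynomial in r evaluated below. */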
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
/* hi = r + log(c) + k*Ln2. */
float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
y = vfmaq_f64 (p, r2, y);
return vfmaq_f64 (hi, y, r2);
}
/* Double-precision implementation of vector asinh(x).
asinh is very sensitive around 1, so it is impractical to devise a single
low-cost algorithm which is sufficiently accurate on a wide range of input.
Instead we use two different algorithms:
asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
shared with the scalar routine. The greatest observed error is 2.79 ULP, in
|x| >= 1:
_ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
want 0x1.ffffd003219ddp-1. */
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
#if WANT_SIMD_EXCEPT
uint64x2_t iax = vreinterpretq_u64_f64 (ax);
uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
special = vorrq_u64 (special, tiny);
#else
uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
#endif
/* Option 1: |x| >= 1.
Compute asinh(x) by asinh(x) = log(x + sqrt(x^2 + 1)).
If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
overflow, by setting special lanes to 1. These will be fixed later. */
float64x2_t option_1 = v_f64 (0);
if (likely (v_any_u64 (gt1)))
{
#if WANT_SIMD_EXCEPT
float64x2_t xm = v_zerofy_f64 (ax, special);
#else
float64x2_t xm = ax;
#endif
option_1 = log_inline (
vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
}
/* Option 2: |x| < 1.
Compute asinh(x) using a polynomial.
If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
overflow, and tiny lanes, which will underflow, by setting them to 0. They
will be fixed later, either by selecting x or falling back to the scalar
special-case. The largest observed error in this region is 1.47 ULPs:
_ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
want 0x1.c1d6bf874019cp-1. */
float64x2_t option_2 = v_f64 (0);
if (likely (v_any_u64 (vceqzq_u64 (gt1))))
{
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
#endif
float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
/* Order-17 Pairwise Horner scheme. */
float64x2_t c13 = vld1q_f64 (&d->c1);
float64x2_t c57 = vld1q_f64 (&d->c5);
float64x2_t c911 = vld1q_f64 (&d->c9);
float64x2_t c1315 = vld1q_f64 (&d->c13);
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
p = vfmaq_f64 (p1213, z2, p);
p = vfmaq_f64 (p1011, z2, p);
p = vfmaq_f64 (p89, z2, p);
p = vfmaq_f64 (p67, z2, p);
p = vfmaq_f64 (p45, z2, p);
p = vfmaq_f64 (p23, z2, p);
p = vfmaq_f64 (p01, z2, p);
option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
#if WANT_SIMD_EXCEPT
option_2 = vbslq_f64 (tiny, x, option_2);
#endif
}
/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
if (unlikely (v_any_u64 (special)))
{
return special_case (x, y, d->abs_mask, special);
}
/* Copy sign. */
return vbslq_f64 (d->abs_mask, y, x);
}
TEST_SIG (V, D, 1, asinh, -10.0, 10.0)
TEST_ULP (V_NAME_D1 (asinh), 2.29)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000)
/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
Ensures the v_sel is choosing the right option in all cases. */
TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5)
TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2)
TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600)
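/* Editor's illustration (hypothetical helper, not library code): the order-17
   polynomial above is evaluated pairwise - fold coefficient pairs
   (c[2k] + c[2k+1]*x) with Horner steps in x^2, roughly halving the serial
   FMA dependency chain relative to plain Horner.  A scalar sketch: */
static double
pairwise_horner (const double *c, int npairs, double x)
{
  double x2 = x * x;
  double p = c[2 * npairs - 2] + c[2 * npairs - 1] * x;
  for (int k = npairs - 2; k >= 0; k--)
    p = (c[2 * k] + c[2 * k + 1] * x) + x2 * p;
  return p;
}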

View file

@@ -0,0 +1,89 @@
/*
* Single-precision vector asinh(x) function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_log1pf_inline.h"
const static struct data
{
struct v_log1pf_data log1pf_consts;
float32x4_t one;
uint32x4_t big_bound;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound;
#endif
} data = {
.one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
#if WANT_SIMD_EXCEPT
.tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
uint32x4_t special, const struct data *d)
{
return v_call_f32 (
asinhf, x,
vreinterpretq_f32_u32 (veorq_u32 (
sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
special);
}
/* Single-precision implementation of vector asinh(x), using vector log1p.
Worst-case error is 2.59 ULP:
_ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
want 0x1.d449c4p-3. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
/* Sidestep tiny and large values to avoid inadvertently triggering
under/overflow. */
special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
if (unlikely (v_any_u32 (special)))
{
ax = v_zerofy_f32 (ax, special);
x = v_zerofy_f32 (x, special);
}
#endif
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
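  /* Editor's note on the identity: x + sqrt(x*x + 1) = 1 + (x + sqrt(x*x + 1) - 1),
     and rationalizing gives sqrt(x*x + 1) - 1 = x*x / (1 + sqrt(x*x + 1)), so
     the log1p form avoids the precision loss of taking the log of a value
     near 1 for small positive x. */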
float32x4_t d
= vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (unlikely (v_any_u32 (special)))
return special_case (special_arg, sign, y, special, dat);
return vreinterpretq_f32_u32 (veorq_u32 (
sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
HALF_WIDTH_ALIAS_F1 (asinh)
TEST_SIG (V, F, 1, asinh, -10.0, 10.0)
TEST_ULP (V_NAME_F1 (asinh), 2.10)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)

View file

@@ -1,32 +1,32 @@
/*
* Double-precision vector atan(x) function.
*
* Copyright (c) 2021-2023, Arm Limited.
* Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f64.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
float64x2_t poly[20];
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-1022, 1.0]. */
.poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
.c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
.c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
.c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
.c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
.c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
.c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
.c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
.c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
.c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
.c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
};
@@ -42,6 +42,11 @@ static const struct data
float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t c13 = vld1q_f64 (&d->c1);
float64x2_t c57 = vld1q_f64 (&d->c5);
float64x2_t c911 = vld1q_f64 (&d->c9);
float64x2_t c1315 = vld1q_f64 (&d->c13);
float64x2_t c1719 = vld1q_f64 (&d->c17);
/* Small cases, infs and nans are supported by our approximation technique,
but do not set fenv flags correctly. Only trigger special case if we need
@@ -80,9 +85,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
float64x2_t y
= vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
/* estrin_7. */
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
/* estrin_11. */
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
float64x2_t y = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
@@ -93,12 +124,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
return y;
}
PL_SIG (V, D, 1, atan, -10.0, 10.0)
PL_TEST_ULP (V_NAME_D1 (atan), 1.78)
PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
TEST_SIG (V, D, 1, atan, -10.0, 10.0)
TEST_ULP (V_NAME_D1 (atan), 1.78)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)

View file

@@ -0,0 +1,171 @@
/*
* Double-precision vector atan2(x) function.
*
* Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
uint64x2_t zeroinfnan, minustwo;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-1022, 1.0]. */
.c0 = V2 (-0x1.5555555555555p-2),
.c1 = 0x1.99999999996c1p-3,
.c2 = V2 (-0x1.2492492478f88p-3),
.c3 = 0x1.c71c71bc3951cp-4,
.c4 = V2 (-0x1.745d160a7e368p-4),
.c5 = 0x1.3b139b6a88ba1p-4,
.c6 = V2 (-0x1.11100ee084227p-4),
.c7 = 0x1.e1d0f9696f63bp-5,
.c8 = V2 (-0x1.aebfe7b418581p-5),
.c9 = 0x1.842dbe9b0d916p-5,
.c10 = V2 (-0x1.5d30140ae5e99p-5),
.c11 = 0x1.338e31eb2fbbcp-5,
.c12 = V2 (-0x1.00e6eece7de8p-5),
.c13 = 0x1.860897b29e5efp-6,
.c14 = V2 (-0x1.0051381722a59p-6),
.c15 = 0x1.14e9dc19a4a4ep-7,
.c16 = V2 (-0x1.d0062b42fe3bfp-9),
.c17 = 0x1.17739e210171ap-10,
.c18 = V2 (-0x1.ab24da7be7402p-13),
.c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
.zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
.minustwo = V2 (0xc000000000000000),
};
#define SignMask v_u64 (0x8000000000000000)
/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
uint64x2_t sign_xy, uint64x2_t cmp)
{
/* Account for the sign of x and y. */
ret = vreinterpretq_f64_u64 (
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
return v_call2_f64 (atan2, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint64x2_t
zeroinfnan (uint64x2_t i, const struct data *d)
{
/* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
}
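/* Editor's sketch of the same check in scalar form (hypothetical helper, not
   part of this file): doubling the bit pattern shifts out the sign bit, +-0
   wraps to UINT64_MAX after the subtraction, and inf/NaN bit patterns compare
   highest, so one unsigned compare catches zero, infinity and NaN. */
static inline uint64_t
is_zeroinfnan_scalar (uint64_t i)
{
  return 2 * i - 1 >= 2 * 0x7ff0000000000000ULL - 1;
}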
/* Fast implementation of vector atan2.
Maximum observed error is 2.8 ulps:
_ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
got 0x1.92d628ab678ccp-1
want 0x1.92d628ab678cfp-1. */
float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t iy = vreinterpretq_u64_f64 (y);
uint64x2_t special_cases
= vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint64x2_t sign_x = vandq_u64 (ix, SignMask);
uint64x2_t sign_y = vandq_u64 (iy, SignMask);
uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
float64x2_t ax = vabsq_f64 (x);
float64x2_t ay = vabsq_f64 (y);
uint64x2_t pred_xlt0 = vcltzq_f64 (x);
uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
/* Set up z for call to atan. */
float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
float64x2_t z = vdivq_f64 (n, q);
/* Work out the correct shift. */
float64x2_t shift
= vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
shift = vmulq_f64 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
full scheme to avoid underflow in x^16.
The order 19 polynomial P approximates
(atan(sqrt(x))-sqrt(x))/x^(3/2). */
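  /* Editor's note: with x2 = z2^2, x4 = z2^4 and x8 = z2^8 below, a full
     Estrin scheme for the degree-19 polynomial in z2 would also need
     x16 = z2^16 = z^32, which underflows for small z; the split head/tail
     recombination stops at x8 = z^16. */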
float64x2_t z2 = vmulq_f64 (z, z);
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
float64x2_t c13 = vld1q_f64 (&d->c1);
float64x2_t c57 = vld1q_f64 (&d->c5);
float64x2_t c911 = vld1q_f64 (&d->c9);
float64x2_t c1315 = vld1q_f64 (&d->c13);
float64x2_t c1719 = vld1q_f64 (&d->c17);
/* estrin_7. */
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
/* estrin_11. */
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
float64x2_t ret = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
ret = vaddq_f64 (ret, shift);
if (unlikely (v_any_u64 (special_cases)))
return special_case (y, x, ret, sign_xy, special_cases);
/* Account for the sign of x and y. */
ret = vreinterpretq_f64_u64 (
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
return ret;
}
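/* Editor's sketch (not upstream code): the same reduction in scalar form.
   atan2 is computed on |x| and |y| as shift * pi/2 + atan(z) with |z| <= 1,
   and the combined sign of x and y is restored at the end, mirroring the
   XOR with sign_xy above.  0/inf/NaN inputs take the scalar fallback.  */
#include <math.h>

static double
scalar_atan2_model (double y, double x)
{
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  double z = aygtax ? -ax / ay : ay / ax;     /* |z| <= 1.  */
  double shift = (x < 0 ? -2.0 : 0.0) + (aygtax ? 1.0 : 0.0);
  double ret = shift * (M_PI / 2) + atan (z); /* atan stands in for P.  */
  return copysign (1.0, x) * copysign (1.0, y) < 0 ? -ret : ret;
}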
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
TEST_SIG (V, D, 2, atan2)
// TODO tighten this once __v_atan2 is fixed
TEST_ULP (V_NAME_D2 (atan2), 2.9)
TEST_DISABLE_FENV (V_NAME_D2 (atan2))
TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)

View file

@ -1,59 +1,64 @@
/*
* Single-precision vector atan2(x) function.
*
* Copyright (c) 2021-2023, Arm Limited.
* Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f32.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float32x4_t poly[8];
float32x4_t pi_over_2;
float32x4_t c0, pi_over_2, c4, c6, c2;
float c1, c3, c5, c7;
uint32x4_t comp_const;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-128, 1.0].
Generated using fpminimax between FLT_MIN and 1. */
.poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
.pi_over_2 = V4 (0x1.921fb6p+0f),
.c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
.c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
.c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
.c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
.pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
};
#define SignMask v_u32 (0x80000000)
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
uint32x4_t sign_xy, uint32x4_t cmp)
{
/* Account for the sign of y. */
ret = vreinterpretq_f32_u32 (
veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
return v_call2_f32 (atan2f, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint32x4_t
zeroinfnan (uint32x4_t i)
zeroinfnan (uint32x4_t i, const struct data *d)
{
/* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
v_u32 (2 * 0x7f800000lu - 1));
return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
}
/* Fast implementation of vector atan2f. Maximum observed error is
2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
_ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
want 0x1.967f00p-1. */
float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
{
const struct data *data_ptr = ptr_barrier (&data);
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t iy = vreinterpretq_u32_f32 (y);
uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
uint32x4_t special_cases
= vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint32x4_t sign_x = vandq_u32 (ix, SignMask);
uint32x4_t sign_y = vandq_u32 (iy, SignMask);
@ -67,14 +72,14 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
/* Set up z for call to atanf. */
float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
float32x4_t z = vdivq_f32 (n, d);
float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
float32x4_t z = vdivq_f32 (n, q);
/* Work out the correct shift. */
float32x4_t shift = vreinterpretq_f32_u32 (
vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
shift = vmulq_f32 (shift, data_ptr->pi_over_2);
shift = vmulq_f32 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
@ -86,30 +91,37 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
float32x4_t z2 = vmulq_f32 (z, z);
float32x4_t z4 = vmulq_f32 (z2, z2);
float32x4_t ret = vfmaq_f32 (
v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
float32x4_t c1357 = vld1q_f32 (&d->c1);
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
/* y = shift + z * P(z^2). */
ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
/* Account for the sign of y. */
ret = vreinterpretq_f32_u32 (
veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
if (unlikely (v_any_u32 (special_cases)))
{
return special_case (y, x, ret, special_cases);
return special_case (y, x, ret, sign_xy, special_cases);
}
return ret;
/* Account for the sign of y. */
return vreinterpretq_f32_u32 (
veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
}
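/* Editor's sketch (not upstream code) of the 2-level Estrin evaluation used
   above: a degree-7 polynomial splits into four independent FMA pairs, glued
   with t^2 and t^4, so the multiplies run in parallel instead of forming one
   long Horner chain.  The degree-19 double-precision version further up
   splits the same scheme in two to avoid underflow in the highest powers.  */
static float
estrin_deg7 (float t, const float c[8])
{
  float t2 = t * t, t4 = t2 * t2;
  float p01 = c[0] + c[1] * t;
  float p23 = c[2] + c[3] * t;
  float p45 = c[4] + c[5] * t;
  float p67 = c[6] + c[7] * t;
  float p03 = p01 + t2 * p23;
  float p47 = p45 + t2 * p67;
  return p03 + t4 * p47; /* here t is z^2 in the caller.  */
}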
HALF_WIDTH_ALIAS_F2 (atan2)
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
PL_SIG (V, F, 2, atan2)
PL_TEST_ULP (V_NAME_F2 (atan2), 2.46)
PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
TEST_SIG (V, F, 2, atan2)
TEST_DISABLE_FENV (V_NAME_F2 (atan2))
TEST_ULP (V_NAME_F2 (atan2), 2.46)
TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)

View file

@ -1,14 +1,14 @@
/*
* Single-precision vector atan(x) function.
*
* Copyright (c) 2021-2023, Arm Limited.
* Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f32.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_poly_f32.h"
static const struct data
{
@ -43,7 +43,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
_ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -98,10 +98,12 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
return y;
}
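/* Editor's sketch (not upstream code) of the reduction described above: for
   |x| > 1, atan(x) = pi/2 - atan(1/x), so the polynomial is evaluated on
   z = -1/x with shift = pi/2, and the sign of x is restored last.  */
#include <math.h>

static float
atanf_reduction_model (float x)
{
  float ax = fabsf (x);
  int reduced = ax > 1.0f;
  float z = reduced ? -1.0f / ax : ax;
  float shift = reduced ? 0x1.921fb6p+0f : 0.0f; /* pi/2.  */
  float y = shift + atanf (z); /* atanf stands in for z + z^3 * P(z^2).  */
  return copysignf (y, x);
}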
PL_SIG (V, F, 1, atan, -10.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (atan), 2.5)
PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
HALF_WIDTH_ALIAS_F1 (atan)
TEST_SIG (V, F, 1, atan, -10.0, 10.0)
TEST_ULP (V_NAME_F1 (atan), 2.5)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)

View file

@ -1,13 +1,13 @@
/*
* Double-precision vector atanh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define WANT_V_LOG1P_K0_SHORTCUT 0
#include "v_log1p_inline.h"
@ -15,15 +15,19 @@
const static struct data
{
struct v_log1p_data log1p_consts;
uint64x2_t one, half;
uint64x2_t one;
uint64x2_t sign_mask;
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
.one = V2 (0x3ff0000000000000),
.half = V2 (0x3fe0000000000000) };
.sign_mask = V2 (0x8000000000000000) };
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
uint64x2_t special, const struct data *d)
{
return v_call_f64 (atanh, x, y, special);
y = log1p_inline (y, &d->log1p_consts);
return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
vmulq_f64 (halfsign, y), special);
}
/* Approximation for vector double-precision atanh(x) using modified log1p.
@ -35,11 +39,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
float64x2_t ax = vabsq_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
uint64x2_t special = vcgeq_u64 (ia, d->one);
float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, special);
@ -47,20 +50,26 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
float64x2_t y;
y = vaddq_f64 (ax, ax);
y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
y = log1p_inline (y, &d->log1p_consts);
y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
if (unlikely (v_any_u64 (special)))
return special_case (x, vmulq_f64 (y, halfsign), special);
#if WANT_SIMD_EXCEPT
return special_case (x, halfsign, y, special, d);
#else
return special_case (ax, halfsign, y, special, d);
#endif
y = log1p_inline (y, &d->log1p_consts);
return vmulq_f64 (y, halfsign);
}
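/* Editor's sketch (not upstream code) of the identity behind the code above:
   atanh(x) = 0.5 * log((1+x)/(1-x)) = sign(x) * 0.5 * log1p(2|x|/(1 - |x|)),
   which is what the halfsign multiply and log1p_inline implement.  */
#include <math.h>

static double
scalar_atanh_model (double x)
{
  double ax = fabs (x);
  double halfsign = copysign (0.5, x);
  return halfsign * log1p (2 * ax / (1 - ax));
}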
PL_SIG (V, D, 1, atanh, -1.0, 1.0)
PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
PL_TEST_ULP (V_NAME_D1 (atanh), 3.32)
TEST_SIG (V, D, 1, atanh, -1.0, 1.0)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
TEST_ULP (V_NAME_D1 (atanh), 3.32)
TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000)
TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100)
/* atanh is asymptotic at 1, which is the default control value - have to set
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
control lane is irrelevant if fp exceptions are disabled). */
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0)
TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0)

View file

@ -1,13 +1,13 @@
/*
* Single-precision vector atanh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_log1pf_inline.h"
const static struct data
@ -30,16 +30,18 @@ const static struct data
#define Half v_u32 (0x3f000000)
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
uint32x4_t special)
{
return v_call_f32 (atanhf, x, y, special);
return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
vmulq_f32 (halfsign, y), special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
The maximum error is 3.08 ULP:
__v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
want 0x1.ffcb82p-5. */
VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
The maximum error is 2.93 ULP:
_ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
want 0x1.f4dcf8p-5. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -58,20 +60,31 @@ VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
y = log1pf_inline (y, d->log1pf_consts);
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
y = log1pf_inline (y, &d->log1pf_consts);
/* If exceptions are not required, pass ax to the special case for a shorter
dependency chain. If exceptions are required, ax will have been zerofied, so
we have to pass x. */
if (unlikely (v_any_u32 (special)))
return special_case (x, vmulq_f32 (halfsign, y), special);
#if WANT_SIMD_EXCEPT
return special_case (x, halfsign, y, special);
#else
return special_case (ax, halfsign, y, special);
#endif
return vmulq_f32 (halfsign, y);
}
PL_SIG (V, F, 1, atanh, -1.0, 1.0)
PL_TEST_ULP (V_NAME_F1 (atanh), 2.59)
PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
HALF_WIDTH_ALIAS_F1 (atanh)
TEST_SIG (V, F, 1, atanh, -1.0, 1.0)
TEST_ULP (V_NAME_F1 (atanh), 2.44)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500)
TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000)
TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000)
/* atanh is asymptotic at 1, which is the default control value - have to set
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
control lane is irrelevant if fp exceptions are disabled). */
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0)
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0)
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0)
TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0)

View file

@ -1,14 +1,14 @@
/*
* Double-precision vector cbrt(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f64.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_poly_f64.h"
const static struct data
{
@ -40,13 +40,20 @@ special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
}
/* Approximation for double-precision vector cbrt(x), using low-order polynomial
and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
/* Approximation for double-precision vector cbrt(x), using low-order
polynomial and two Newton iterations.
The vector version of frexp does not handle subnormals
correctly. As a result these need to be handled by the scalar
fallback, where accuracy may be worse than that of the vector code
path.
Greatest observed error in the normal range is 1.79 ULP. Errors repeat
according to the exponent, for instance an error observed for double value
m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
integer.
__v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
want 0x1.965fe72821e99p+0. */
_ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
want 0x1.965fe72821e99p+0. */
VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@ -64,8 +71,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
/* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
Newton iterations. */
/* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
for Newton iterations. */
float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
float64x2_t one_third = d->one_third;
/* Two iterations of Newton's method for iteratively approximating cbrt. */
@ -84,8 +91,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
an integer in [-2, 2], and can be looked up in the table T. Hence the
Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
is an integer in [-2, 2], and can be looked up in the table T. Hence the
result is assembled as:
cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
@ -110,7 +117,11 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
return vbslq_f64 (d->abs_mask, y, x);
}
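/* Editor's sketch (not upstream code) of the exponent assembly described
   above: with x = m * 2^e and m in [0.5, 1),
   cbrt(x) = cbrt(m) * 2^(i/3) * 2^round(e/3), where i = e - 3*round(e/3)
   lies in [-2, 2] and 2^(i/3) is looked up in a 5-entry table.  */
#include <math.h>

static double
cbrt_assemble (double cbrt_m, int e, double sign)
{
  /* T[i + 2] = 2^(i/3) for i in [-2, 2].  */
  static const double T[5]
      = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
	  0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 };
  int round_e3 = (int) nearbyint (e / 3.0);
  int i = e - 3 * round_e3; /* remainder, in [-2, 2].  */
  return sign * cbrt_m * T[i + 2] * ldexp (1.0, round_e3);
}

/* One Newton step for a ~= cbrt(m): a' = (2*a + m/(a*a)) / 3.  */
static double
cbrt_newton_step (double m, double a)
{
  return (2 * a + m / (a * a)) / 3;
}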
PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30)
PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt))
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
/* Worst-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which
has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
in the vector path is 1.79 ULP.
[1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
Functions in Single, Double, Double Extended, and Quadruple Precision. */
TEST_ULP (V_NAME_D1 (cbrt), 3.17)
TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)

View file

@ -1,14 +1,14 @@
/*
* Single-precision vector cbrt(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f32.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_poly_f32.h"
const static struct data
{
@ -49,7 +49,7 @@ shifted_lookup (const float *table, int32x4_t i)
0x1.85a2aa and the exponent is a multiple of 3, for example:
_ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
want 0x1.267932p+1. */
VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
@ -110,7 +110,8 @@ VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
return vbslq_f32 (SignMask, x, y);
}
PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15)
PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt))
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
HALF_WIDTH_ALIAS_F1 (cbrt)
TEST_SIG (V, F, 1, cbrt, -10.0, 10.0)
TEST_ULP (V_NAME_F1 (cbrt), 1.15)
TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)

View file

@ -1,13 +1,13 @@
/*
* Double-precision vector sincos function - return-by-value interface.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_sincos_common.h"
#include "v_math.h"
#include "pl_test.h"
#include "test_defs.h"
static float64x2x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
@ -34,11 +34,13 @@ _ZGVnN2v_cexpi (float64x2_t x)
return sc;
}
PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos)
TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin)
TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
#define V_CEXPI_INTERVAL(lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
V_CEXPI_INTERVAL (0, 0x1p23, 500000)
V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
V_CEXPI_INTERVAL (0x1p23, inf, 10000)

View file

@ -1,13 +1,13 @@
/*
* Single-precision vector cexpi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_sincosf_common.h"
#include "v_math.h"
#include "pl_test.h"
#include "test_defs.h"
static float32x4x2_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
@ -36,11 +36,13 @@ _ZGVnN4v_cexpif (float32x4_t x)
return sc;
}
PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin)
TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos)
TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
#define V_CEXPIF_INTERVAL(lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
V_CEXPIF_INTERVAL (0x1p20, inf, 10000)

View file

@ -1,17 +1,19 @@
/*
* Double-precision vector cos function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
float64x2_t poly[7];
float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
@ -19,11 +21,9 @@ static const struct data
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
V2 (-0x1.9e9540300a1p-41) },
.inv_pi = V2 (0x1.45f306dc9c883p-2),
.half_pi = V2 (0x1.921fb54442d18p+0),
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
.shift = V2 (0x1.8p52),
.range_val = V2 (0x1p23)
};
@ -57,10 +57,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
n = vsubq_f64 (n, d->shift);
n = vsubq_f64 (n, v_f64 (0.5));
n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
n = vsubq_f64 (n, v_f64 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
@ -85,3 +84,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
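/* Editor's sketch (not upstream code) of the new reduction above: pick
   k = round(|x|/pi + 1/2) and n = k - 1/2, so r = |x| - n*pi falls in
   [-pi/2, pi/2] and cos(|x|) = (-1)^k * sin(r); the vector code applies
   (-1)^k by XORing the "odd" bit into the sign of the polynomial result.  */
#include <math.h>

static double
cos_reduction_model (double x)
{
  double ax = fabs (x);
  double k = round (ax / M_PI + 0.5); /* round: ties away, like vrndaq.  */
  double r = ax - (k - 0.5) * M_PI;   /* reduced to [-pi/2, pi/2].  */
  double s = sin (r);		      /* sin stands in for the polynomial.  */
  return ((long long) k & 1) ? -s : s;
}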
TEST_SIG (V, D, 1, cos, -3.1, 3.1)
TEST_ULP (V_NAME_D1 (cos), 3.0)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000)
TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000)

View file

@ -1,17 +1,19 @@
/*
* Single-precision vector cos function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
float32x4_t poly[4];
float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@ -22,8 +24,6 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
.shift = V4 (0x1.8p+23f),
.half_pi = V4 (0x1.921fb6p0f),
.range_val = V4 (0x1p20f)
};
@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
return v_call_f32 (cosf, x, y, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, r3, y;
@ -58,9 +58,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
n = vsubq_f32 (n, d->shift);
n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
n = vsubq_f32 (n, v_f32 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
@ -80,3 +79,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
HALF_WIDTH_ALIAS_F1 (cos)
TEST_SIG (V, F, 1, cos, -3.1, 3.1)
TEST_ULP (V_NAME_F1 (cos), 1.4)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000)
TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000)

View file

@ -1,18 +1,20 @@
/*
* Double-precision vector cosh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float64x2_t poly[3];
float64x2_t inv_ln2, ln2, shift, thres;
float64x2_t inv_ln2;
double ln2[2];
float64x2_t shift, thres;
uint64x2_t index_mask, special_bound;
} data = {
.poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@ -48,8 +50,9 @@ exp_inline (float64x2_t x)
float64x2_t n = vsubq_f64 (z, d->shift);
/* r = x - n*ln2/N. */
float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
float64x2_t ln2 = vld1q_f64 (d->ln2);
float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
r = vfmaq_laneq_f64 (r, n, ln2, 1);
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
uint64x2_t i = vandq_u64 (u, d->index_mask);
@ -97,8 +100,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
return vaddq_f64 (half_t, half_over_t);
}
PL_SIG (V, D, 1, cosh, -10.0, 10.0)
PL_TEST_ULP (V_NAME_D1 (cosh), 1.43)
PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh))
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
TEST_SIG (V, D, 1, cosh, -10.0, 10.0)
TEST_ULP (V_NAME_D1 (cosh), 1.43)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)

View file

@ -1,32 +1,39 @@
/*
* Single-precision vector cosh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_expf_inline.h"
#include "v_math.h"
#include "mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
struct v_expf_data expf_consts;
uint32x4_t tiny_bound, special_bound;
uint32x4_t tiny_bound;
float32x4_t bound;
#if WANT_SIMD_EXCEPT
uint32x4_t special_bound;
#endif
} data = {
.expf_consts = V_EXPF_DATA,
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
.bound = V4 (0x1.5a92d8p+6),
#if WANT_SIMD_EXCEPT
.special_bound = V4 (0x42ad496c),
#endif
};
#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
uint32x4_t special)
{
return v_call_f32 (coshf, x, y, special);
return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
#endif
@ -34,18 +41,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
Maximum error is 2.38 ULP:
_ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
want 0x1.6a4922p+4. */
float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered correctly, fall back to the scalar
variant for all inputs if any input is a special value or above the bound
at which expf overflows. */
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
if (unlikely (v_any_u32 (special)))
return v_call_f32 (coshf, x, x, v_u32 (-1));
@ -54,10 +60,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
input to 0, which will generate no exceptions. */
if (unlikely (v_any_u32 (tiny)))
ax = v_zerofy_f32 (ax, tiny);
float32x4_t t = v_expf_inline (ax, &d->expf_consts);
#else
uint32x4_t special = vcageq_f32 (x, d->bound);
float32x4_t t = v_expf_inline (x, &d->expf_consts);
#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
float32x4_t t = v_expf_inline (ax, &d->expf_consts);
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
@ -66,15 +75,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (unlikely (v_any_u32 (special)))
return special_case (x, vaddq_f32 (half_t, half_over_t), special);
return special_case (x, half_t, half_over_t, special);
#endif
return vaddq_f32 (half_t, half_over_t);
}
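/* Editor's sketch (not upstream code) of the identity used above: with
   t = e^|x|, cosh(x) = t/2 + 1/(2t).  Using |x| keeps t >= 1, so neither
   half can underflow before expf itself overflows at the bound.  */
#include <math.h>

static float
coshf_model (float x)
{
  float t = expf (fabsf (x));
  return 0.5f * t + 0.5f / t;
}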
PL_SIG (V, F, 1, cosh, -10.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (cosh), 1.89)
PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
HALF_WIDTH_ALIAS_F1 (cosh)
TEST_SIG (V, F, 1, cosh, -10.0, 10.0)
TEST_ULP (V_NAME_F1 (cosh), 1.89)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000)
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000)
TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)

View file

@ -1,15 +1,15 @@
/*
* Double-precision vector cospi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f64.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@ -31,7 +31,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
return v_call_f64 (cospi, x, y, cmp);
return v_call_f64 (arm_math_cospi, x, y, cmp);
}
/* Approximation for vector double-precision cospi(x).
@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
PL_SIG (V, D, 1, cospi, -0.9, 0.9)
PL_TEST_ULP (V_NAME_D1 (cospi), 2.56)
PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
#if WANT_TRIGPI_TESTS
TEST_ULP (V_NAME_D1 (cospi), 2.56)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
#endif

View file

@ -1,15 +1,15 @@
/*
* Single-precision vector cospi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f32.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@ -26,14 +26,14 @@ static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
{
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
return v_call_f32 (cospif, x, y, cmp);
return v_call_f32 (arm_math_cospif, x, y, cmp);
}
/* Approximation for vector single-precision cospi(x)
Maximum Error: 3.17 ULP:
_ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
want 0x1.f7cd5p-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -74,10 +74,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
PL_SIG (V, F, 1, cospi, -0.9, 0.9)
PL_TEST_ULP (V_NAME_F1 (cospi), 2.67)
PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
HALF_WIDTH_ALIAS_F1 (cospi)
#if WANT_TRIGPI_TESTS
TEST_ULP (V_NAME_F1 (cospi), 2.67)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
#endif

View file

@ -1,30 +1,32 @@
/*
* Double-precision vector erf(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float64x2_t third;
float64x2_t tenth, two_over_five, two_over_fifteen;
float64x2_t two_over_nine, two_over_fortyfive;
float64x2_t tenth, two_over_five, two_over_nine;
double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
.max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
.two_over_fifteen = V2 (0x1.1111111111111p-3),
.two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
.two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
.two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@ -46,8 +48,8 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
e.erf = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@ -77,8 +79,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@ -105,7 +107,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
i = vbslq_u64 (a_le_max, i, v_u64 (768));
i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);
float64x2_t r = vsubq_f64 (z, shift);
@ -115,14 +117,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t two_over_fifteen_and_fortyfive
= vld1q_f64 (&dat->two_over_fifteen);
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
@ -150,9 +157,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
return y;
}
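/* Editor's sketch (not upstream code) of the shift-based table split used
   above: adding 2^45 to a in [0, 6) forces rounding to the nearest 1/128
   (one ulp of the sum), which yields the table index, the quantized
   abscissa r and the small remainder d in one step.  */
#include <stdint.h>
#include <string.h>

static void
erf_lookup_split (double a, uint64_t *idx, double *r, double *d)
{
  double shift = 0x1p45;
  double z = a + shift; /* a rounded onto the 1/128 grid.  */
  uint64_t iz, ishift;
  memcpy (&iz, &z, sizeof iz);
  memcpy (&ishift, &shift, sizeof ishift);
  *idx = iz - ishift; /* round (a * 128) as an integer.  */
  *r = z - shift;
  *d = a - *r; /* |d| <= 1/256.  */
}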
PL_SIG (V, D, 1, erf, -6.0, 6.0)
PL_TEST_ULP (V_NAME_D1 (erf), 1.79)
PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
TEST_SIG (V, D, 1, erf, -6.0, 6.0)
TEST_ULP (V_NAME_D1 (erf), 1.79)
/* WANT_SIMD_EXCEPT blocks miss some cases. */
TEST_DISABLE_FENV (V_NAME_D1 (erf))
TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)

View file

@ -1,21 +1,21 @@
/*
* Double-precision vector erfc(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
float64x2_t p20, p40, p41, p42;
float64x2_t p51, p52;
float64x2_t qr5, qr6, qr7, qr8, qr9;
float64x2_t p20, p40, p41, p51;
double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
#endif
@ -30,9 +30,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
.p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
.p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
.p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
.p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@ -57,8 +57,10 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
float64x2_t e1
= vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
float64x2_t e2
= vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@ -144,22 +146,26 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
qr9 = vld1q_f64 (dat->qr9);
float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
p6 = vmulq_laneq_f64 (p6, qr5, 1);
float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
p7 = vmulq_laneq_f64 (p7, qr6, 1);
float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
p8 = vmulq_laneq_f64 (p8, qr7, 1);
float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
p9 = vmulq_laneq_f64 (p9, qr8, 1);
float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
p10 = vmulq_laneq_f64 (p10, qr9, 1);
/* Compute polynomial in d using pairwise Horner scheme. */
float64x2_t p90 = vfmaq_f64 (p9, d, p10);
float64x2_t p78 = vfmaq_f64 (p7, d, p8);
@ -189,10 +195,11 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
return vfmaq_f64 (off, fac, y);
}
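/* Editor's sketch (not upstream code) of the coefficient recurrence above:
   starting from p4 and p5, the higher terms follow
   p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1},
   with Q_i = (i+1)/i and R_i = -2i/((i+1)(i+2)), exactly the constant pairs
   stored in qr5..qr9.  */
static void
erfc_recurrence_model (double r, double p[11])
{
  for (int i = 4; i <= 8; i++)
    {
      double q = (double) (i + 2) / (i + 1);		/* Q_{i+1}.  */
      double rr = -2.0 * (i + 1) / ((i + 2) * (i + 3)); /* R_{i+1}.  */
      p[i + 2] = (p[i] + r * q * p[i + 1]) * rr;
    }
}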
PL_SIG (V, D, 1, erfc, -6.0, 28.0)
PL_TEST_ULP (V_NAME_D1 (erfc), 1.21)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
TEST_SIG (V, D, 1, erfc, -6.0, 28.0)
TEST_ULP (V_NAME_D1 (erfc), 1.21)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)

View file

@ -1,19 +1,20 @@
/*
* Single-precision vector erfc(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
uint32x4_t offset, table_scale;
float32x4_t max, shift;
float32x4_t coeffs, third, two_over_five, tenth;
float coeffs[4];
float32x4_t third, two_over_five, tenth;
#if WANT_SIMD_EXCEPT
float32x4_t uflow_bound;
#endif
@ -27,7 +28,7 @@ static const struct data
.shift = V4 (0x1p17f),
/* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
fmas. */
.coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
.coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
@ -50,12 +51,16 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
float32x2_t t0
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
float32x2_t t1
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
float32x2_t t2
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
float32x2_t t3
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
float32x4_t e1 = vcombine_f32 (t0, t1);
float32x4_t e2 = vcombine_f32 (t2, t3);
e.erfc = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
@ -86,8 +91,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
_ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
want 0x1.f51216p-120. */
VPCS_ATTR
float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
@ -130,10 +134,11 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p1 = r;
float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
float32x4_t coeffs = vld1q_f32 (dat->coeffs);
float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
float32x4_t p3
= vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
= vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
p4 = vfmsq_f32 (dat->tenth, r2, p4);
float32x4_t y = vfmaq_f32 (p3, d, p4);
@ -157,10 +162,13 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
return vfmaq_f32 (off, fac, y);
}
PL_SIG (V, F, 1, erfc, -4.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (erfc), 1.14)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
HALF_WIDTH_ALIAS_F1 (erfc)
TEST_SIG (V, F, 1, erfc, -4.0, 10.0)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT)
TEST_ULP (V_NAME_F1 (erfc), 1.14)
TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)

View file

@ -1,13 +1,13 @@
/*
* Single-precision vector erf(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@ -37,12 +37,12 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
float32x4_t e1 = vcombine_f32 (t0, t1);
float32x4_t e2 = vcombine_f32 (t2, t3);
e.erf = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
@ -61,7 +61,7 @@ lookup (uint32x4_t i)
Maximum error: 1.93 ULP
_ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
want 0x1.fd6868p-9. */
float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
@ -110,9 +110,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
return y;
}
PL_SIG (V, F, 1, erf, -4.0, 4.0)
PL_TEST_ULP (V_NAME_F1 (erf), 1.43)
PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
HALF_WIDTH_ALIAS_F1 (erf)
TEST_SIG (V, F, 1, erf, -4.0, 4.0)
TEST_ULP (V_NAME_F1 (erf), 1.43)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)

View file

@ -1,12 +1,14 @@
/*
* Double-precision vector e^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask (N - 1)
@ -123,3 +125,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
return vfmaq_f64 (s, y, s);
}
TEST_SIG (V, D, 1, exp, -9.9, 9.9)
TEST_ULP (V_NAME_D1 (exp), 1.9)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000)

View file

@ -1,14 +1,15 @@
/*
* Double-precision vector 10^x function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define _GNU_SOURCE
#include "mathlib.h"
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
/* Value of |x| above which scale overflows without special treatment. */
#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
@ -135,10 +136,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
return vfmaq_f64 (s, y, s);
}
PL_SIG (S, D, 1, exp10, -9.9, 9.9)
PL_SIG (V, D, 1, exp10, -9.9, 9.9)
PL_TEST_ULP (V_NAME_D1 (exp10), 1.15)
PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
#if WANT_EXP10_TESTS
TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
TEST_ULP (V_NAME_D1 (exp10), 1.15)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
#endif

View file

@ -1,23 +1,24 @@
/*
* Single-precision vector 10^x function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#define _GNU_SOURCE
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f32.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_poly_f32.h"
#define ScaleBound 192.0f
static const struct data
{
float32x4_t poly[5];
float32x4_t log10_2_and_inv, shift;
float32x4_t c0, c1, c3;
float log10_2_high, log10_2_low, c2, c4;
float32x4_t inv_log10_2, special_bound;
uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@ -27,19 +28,24 @@ static const struct data
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 1.85943 +0.5 ulp. */
.poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
.shift = V4 (0x1.8p23f),
/* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
.log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
.c0 = V4 (0x1.26bb16p+1f),
.c1 = V4 (0x1.5350d2p+1f),
.c2 = 0x1.04744ap+1f,
.c3 = V4 (0x1.2d8176p+0f),
.c4 = 0x1.12b41ap-1f,
.inv_log10_2 = V4 (0x1.a934fp+1),
.log10_2_high = 0x1.344136p-2,
.log10_2_low = 0x1.ec10cp-27,
/* rint (log2 (2^127 / (1 + sqrt (2)))). */
.special_bound = V4 (126.0f),
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
};
#define ExponentBias v_u32 (0x3f800000)
#if WANT_SIMD_EXCEPT
# define SpecialBound 38.0f /* rint(log10(2^127)). */
@ -57,17 +63,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
# define SpecialOffset v_u32 (0x82000000)
# define SpecialBias v_u32 (0x7f000000)
# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@ -84,7 +88,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
Algorithm is accurate to 2.36 ULP.
_ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
want 0x1.7e79cp+11. */
float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
@ -102,22 +106,23 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
float32x4_t n = vsubq_f32 (z, d->shift);
float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t poly
= vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
@ -129,10 +134,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
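/* Editor's sketch (not upstream code) of the decomposition above:
   n = round(x / log10(2)) and r = x - n*log10(2), so that
   exp10(x) = 2^n * 10^r with |r| <= log10(2)/2.  log10(2) is split into
   high and low parts to keep r accurate; powf stands in for the degree-4
   polynomial on the reduced interval.  */
#include <math.h>

static float
exp10f_model (float x)
{
  float n = roundf (x * 0x1.a934fp+1f); /* x / log10(2), ties away.  */
  float r = x - n * 0x1.344136p-2f;	/* subtract n * log10(2) (high).  */
  r += n * 0x1.ec10cp-27f;		/* add back the low correction.  */
  return exp2f (n) * powf (10.0f, r);
}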
PL_SIG (S, F, 1, exp10, -9.9, 9.9)
PL_SIG (V, F, 1, exp10, -9.9, 9.9)
PL_TEST_ULP (V_NAME_F1 (exp10), 1.86)
PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
HALF_WIDTH_ALIAS_F1 (exp10)
#if WANT_EXP10_TESTS
TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
TEST_ULP (V_NAME_F1 (exp10), 1.86)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
#endif

View file

@ -1,19 +1,20 @@
/*
* Double-precision vector 2^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f64.h"
#include "test_sig.h"
#include "test_defs.h"
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask (N - 1)
#define BigBound 1022.0
#define UOFlowBound 1280.0
#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
static const struct data
{
@ -38,7 +39,6 @@ lookup_sbits (uint64x2_t i)
#if WANT_SIMD_EXCEPT
# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
/* Call scalar exp2 as a fallback. */
@ -62,8 +62,8 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n,
/* 2^(n/N) may overflow, break it up into s1*s2. */
uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
float64x2_t s2 = vreinterpretq_f64_u64 (
vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
float64x2_t r1 = vmulq_f64 (s1, s1);
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
@ -119,10 +119,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
return vfmaq_f64 (s, s, y);
}
PL_SIG (V, D, 1, exp2, -9.9, 9.9)
PL_TEST_ULP (V_NAME_D1 (exp2), 1.15)
PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
TEST_SIG (V, D, 1, exp2, -9.9, 9.9)
TEST_ULP (V_NAME_D1 (exp2), 1.15)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
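When n is large enough that 2^n alone over- or underflows, the special path above factors the scale in two. A minimal scalar sketch of the idea (scale_split is an illustrative name; the vector code folds the polynomial into the first multiply):
#include <stdint.h>
/* result = y * 2^n for |n| up to about 2044, where a single 2^n may not
   be representable; each half factor stays a normal double.  */
static double
scale_split (double y, int n)
{
  int n1 = n / 2;
  int n2 = n - n1;
  union { uint64_t u; double f; } s1 = { .u = (uint64_t) (n1 + 1023) << 52 };
  union { uint64_t u; double f; } s2 = { .u = (uint64_t) (n2 + 1023) << 52 };
  return (y * s1.f) * s2.f;
}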

View file

@ -1,33 +1,38 @@
/*
* Single-precision vector 2^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
float32x4_t poly[5];
uint32x4_t exponent_bias;
float32x4_t c1, c3;
uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
float32x4_t scale_thresh, special_bound;
#endif
float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
.poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
.c0 = 0x1.59977ap-10f,
.c1 = V4 (0x1.3ce9e4p-7f),
.c2 = 0x1.c6bd32p-5f,
.c3 = V4 (0x1.ebf9bcp-3f),
.c4 = 0x1.62e422p-1f,
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
#define C(i) d->poly[i]
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
@ -44,16 +49,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
# define SpecialOffset v_u32 (0x82000000)
# define SpecialBias v_u32 (0x7f000000)
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@ -66,16 +68,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
#endif
float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, scale, p, q, poly;
uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
@ -84,23 +84,24 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
n = vrndaq_f32 (x);
r = vsubq_f32 (x, n);
e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
float32x4_t n = vrndaq_f32 (x);
float32x4_t r = vsubq_f32 (x, n);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
cmp = vcagtq_f32 (n, d->special_bound);
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
r2 = vmulq_f32 (r, r);
p = vfmaq_f32 (C (1), C (0), r);
q = vfmaq_f32 (C (3), C (2), r);
float32x4_t c024 = vld1q_f32 (&d->c0);
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (C (4), r);
poly = vfmaq_f32 (p, q, r2);
p = vmulq_laneq_f32 (r, c024, 2);
float32x4_t poly = vfmaq_f32 (p, q, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
@ -111,3 +112,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
HALF_WIDTH_ALIAS_F1 (exp2)
TEST_SIG (V, F, 1, exp2, -9.9, 9.9)
TEST_ULP (V_NAME_F1 (exp2), 1.49)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000)
TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000)
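The rewritten evaluation keeps the even-index coefficients as scalars (c0, c2, c4, plus a zero pad in the struct) so one vld1q replaces three splatted constants, and lane-indexed FMAs read them in place. A sketch of just that pattern (assumes arm_neon.h; names illustrative):
#include <arm_neon.h>
/* poly(r) = r*c4 + r2*(c3 + r*c2 + r2*(c1 + r*c0)), with c024 loaded
   once from the packed scalars { c0, c2, c4, pad }.  */
static inline float32x4_t
exp2f_poly_sketch (float32x4_t r, float32x4_t c1, float32x4_t c3,
		   float32x4_t c024)
{
  float32x4_t r2 = vmulq_f32 (r, r);
  float32x4_t p = vfmaq_laneq_f32 (c1, r, c024, 0);	/* c1 + r*c0.  */
  float32x4_t q = vfmaq_laneq_f32 (c3, r, c024, 1);	/* c3 + r*c2.  */
  q = vfmaq_f32 (q, p, r2);
  p = vmulq_laneq_f32 (r, c024, 2);			/* r*c4.  */
  return vfmaq_f32 (p, q, r2);
}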

View file

@ -0,0 +1,73 @@
/*
* Single-precision vector 2^x function.
*
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_defs.h"
static const struct data
{
float32x4_t c0, c1, c2, c3, c4, c5, shift;
uint32x4_t exponent_bias;
float32x4_t special_bound, scale_thresh;
uint32x4_t special_offset, special_bias;
} data = {
.shift = V4 (0x1.8p23f),
.exponent_bias = V4 (0x3f800000),
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
/* maxerr: 0.878 ulp. */
.c0 = V4 (0x1.416b5ep-13f),
.c1 = V4 (0x1.5f082ep-10f),
.c2 = V4 (0x1.3b2dep-7f),
.c3 = V4 (0x1.c6af7cp-5f),
.c4 = V4 (0x1.ebfbdcp-3f),
.c5 = V4 (0x1.62e43p-1f),
};
static float32x4_t VPCS_ATTR NOINLINE
specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r1 = vmulq_f32 (s1, s1);
float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
| (~cmp & vreinterpretq_u32_f32 (r0)));
}
float32x4_t VPCS_ATTR
_ZGVnN4v_exp2f_1u (float32x4_t x)
{
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
const struct data *d = ptr_barrier (&data);
float32x4_t n = vrndaq_f32 (x);
float32x4_t r = x - n;
uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
float32x4_t p = vfmaq_f32 (d->c1, d->c0, r);
p = vfmaq_f32 (d->c2, p, r);
p = vfmaq_f32 (d->c3, p, r);
p = vfmaq_f32 (d->c4, p, r);
p = vfmaq_f32 (d->c5, p, r);
p = vfmaq_f32 (v_f32 (1.0f), p, r);
if (unlikely (v_any_u32 (cmp)))
return specialcase (p, n, e, d);
return scale * p;
}
TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4)
TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u)
TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000)
TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000)
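The bitwise select at the end of specialcase above is the open-coded form of a lane select; vbslq_f32 computes the same (cmp & r1) | (~cmp & r0) in one instruction. Equivalent sketch, assuming arm_neon.h:
#include <arm_neon.h>
static inline float32x4_t
select_sketch (uint32x4_t cmp, float32x4_t r1, float32x4_t r0)
{
  /* Per bit: cmp ? r1 : r0 — identical to the OR/AND pair above.  */
  return vbslq_f32 (cmp, r1, r0);
}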

View file

@ -1,30 +1,34 @@
/*
* Single-precision vector e^x function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
float32x4_t poly[5];
float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
uint32x4_t exponent_bias;
float32x4_t c1, c3, c4, inv_ln2;
float ln2_hi, ln2_lo, c0, c2;
uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
.shift = V4 (0x1.8p23f),
.c0 = 0x1.0e4020p-7f,
.c1 = V4 (0x1.573e2ep-5f),
.c2 = 0x1.555e66p-3f,
.c3 = V4 (0x1.fffdb6p-2f),
.c4 = V4 (0x1.ffffecp-1f),
.inv_ln2 = V4 (0x1.715476p+0f),
.ln2_hi = V4 (0x1.62e4p-1f),
.ln2_lo = V4 (0x1.7f7d1cp-20f),
.ln2_hi = 0x1.62e4p-1f,
.ln2_lo = 0x1.7f7d1cp-20f,
.exponent_bias = V4 (0x3f800000),
.special_offset = V4 (0x82000000),
.special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
@ -49,19 +53,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
# define SpecialOffset v_u32 (0x82000000)
# define SpecialBias v_u32 (0x7f000000)
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
/* (s2 + p*s2) * s1 = s2 * (p + 1) * s1.  */
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@ -71,15 +73,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
#endif
float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, scale, p, q, poly, z;
uint32x4_t cmp, e;
float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
cmp = vcgeq_u32 (
uint32x4_t cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
@ -93,23 +94,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
z = vfmaq_f32 (d->shift, x, d->inv_ln2);
n = vsubq_f32 (z, d->shift);
r = vfmsq_f32 (x, n, d->ln2_hi);
r = vfmsq_f32 (r, n, d->ln2_lo);
e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
cmp = vcagtq_f32 (n, d->special_bound);
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
r2 = vmulq_f32 (r, r);
p = vfmaq_f32 (C (1), C (0), r);
q = vfmaq_f32 (C (3), C (2), r);
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (C (4), r);
poly = vfmaq_f32 (p, q, r2);
p = vmulq_f32 (d->c4, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
@ -120,3 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
HALF_WIDTH_ALIAS_F1 (exp)
TEST_SIG (V, F, 1, exp, -9.9, 9.9)
TEST_ULP (V_NAME_F1 (exp), 1.49)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000)
TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000)
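The deleted reduction (z = fma(x, inv_ln2, shift); n = z - shift) rounded by abusing float formatting: adding 1.5*2^23 pushes the fraction bits out, so subtracting it back yields round-to-nearest-even; vrndaq_f32/vcvtaq_s32_f32 now do this directly. A scalar sketch of the old trick (round_by_shift is illustrative; volatile blocks constant folding):
/* Valid for |z| < 2^22 under the default rounding mode. Ties go to even
   here, while vrndaq rounds them away from zero; both keep r inside the
   reduced interval, so either is acceptable.  */
static float
round_by_shift (float z)
{
  volatile float shift = 0x1.8p23f;	/* 1.5 * 2^23.	*/
  float t = z + shift;
  return t - shift;
}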

View file

@ -0,0 +1,79 @@
/*
* Single-precision vector e^x function.
*
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_defs.h"
static const struct data
{
float32x4_t shift, inv_ln2;
uint32x4_t exponent_bias;
float32x4_t c1, c2, c3, c4;
float32x4_t special_bound, scale_thresh;
uint32x4_t special_offset, special_bias;
float ln2_hi, ln2_lo, c0, nothing;
} data = {
.ln2_hi = 0x1.62e4p-1f,
.ln2_lo = 0x1.7f7d1cp-20f,
.shift = V4 (0x1.8p23f),
.inv_ln2 = V4 (0x1.715476p+0f),
.exponent_bias = V4 (0x3f800000),
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
.special_offset = V4 (0x83000000),
.special_bias = V4 (0x7f000000),
/* maxerr: 0.36565 +0.5 ulp. */
.c0 = 0x1.6a6000p-10f,
.c1 = V4 (0x1.12718ep-7f),
.c2 = V4 (0x1.555af0p-5f),
.c3 = V4 (0x1.555430p-3f),
.c4 = V4 (0x1.fffff4p-2f),
};
static float32x4_t VPCS_ATTR NOINLINE
specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r1 = vmulq_f32 (s1, s1);
float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
| (~cmp & vreinterpretq_u32_f32 (r0)));
}
float32x4_t VPCS_ATTR
_ZGVnN4v_expf_1u (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi);
/* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t z = vmulq_f32 (x, d->inv_ln2);
float32x4_t n = vrndaq_f32 (z);
float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
p = vfmaq_f32 (d->c2, p, r);
p = vfmaq_f32 (d->c3, p, r);
p = vfmaq_f32 (d->c4, p, r);
p = vfmaq_f32 (v_f32 (1.0f), p, r);
p = vfmaq_f32 (v_f32 (1.0f), p, r);
if (unlikely (v_any_u32 (cmp)))
return specialcase (p, n, e, d);
return scale * p;
}
TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)

View file

@ -0,0 +1,77 @@
/*
* Double-precision vector exp(x) - 1 function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_expm1_inline.h"
static const struct data
{
struct v_expm1_data d;
#if WANT_SIMD_EXCEPT
uint64x2_t thresh, tiny_bound;
#else
float64x2_t oflow_bound;
#endif
} data = {
.d = V_EXPM1_DATA,
#if WANT_SIMD_EXCEPT
/* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
compare. */
.thresh = V2 (0x78c56fa6d34b552),
/* asuint64(0x1p-51) << 1. */
.tiny_bound = V2 (0x3cc0000000000000 << 1),
#else
/* Value above which expm1(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
a small window where fallbacks are triggered unnecessarily. */
.oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
#endif
};
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, const struct data *d)
{
return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
special);
}
/* Double-precision vector exp(x) - 1 function.
The maximum observed error is 2.05 ULP:
_ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
want 0x1.a8897eef87b32p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
uint64x2_t ix = vreinterpretq_u64_f64 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint64x2_t special
= vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
#else
/* Large input, NaNs and Infs. */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
#endif
if (unlikely (v_any_u64 (special)))
return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
return expm1_inline (x, &d->d);
}
TEST_SIG (V, D, 1, expm1, -9.9, 9.9)
TEST_ULP (V_NAME_D1 (expm1), 1.56)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
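The WANT_SIMD_EXCEPT test above packs three checks into one compare: doubling the bit pattern drops the sign, and the bounds were pre-shifted offline. A scalar sketch using the same constants as the data struct (expm1_is_special is an illustrative name):
#include <stdint.h>
#include <string.h>
/* True for |x| < 2^-51, |x| > oflow_bound, Inf and NaN.  */
static int
expm1_is_special (double x)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint64_t tiny2 = 0x3cc0000000000000ULL << 1;	/* asuint64(0x1p-51) << 1.  */
  uint64_t thresh2 = 0x078c56fa6d34b552ULL;	/* (oflow - tiny) << 1.  */
  return ix + ix - tiny2 >= thresh2;	/* ix + ix == ix << 1: sign bit gone.  */
}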

View file

@ -0,0 +1,82 @@
/*
* Single-precision vector exp(x) - 1 function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_expm1f_inline.h"
static const struct data
{
struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
.d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
a small window where fallbacks are triggered unnecessarily. */
.oflow_bound = V4 (0x1.5ebc4p+6),
#else
/* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
compare. */
.thresh = V4 (0x1d5ebc40),
#endif
};
/* asuint(0x1p-23), shifted by 1 for abs compare. */
#define TinyBound v_u32 (0x34000000 << 1)
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
return v_call_f32 (
expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
The maximum error is 1.62 ULP:
_ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
if (unlikely (v_any_u32 (special)))
return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
return expm1f_inline (x, &d->d);
}
HALF_WIDTH_ALIAS_F1 (expm1)
TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
TEST_ULP (V_NAME_F1 (expm1), 1.13)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)

View file

@ -1,7 +1,7 @@
/*
* Double-precision x^y function.
*
* Copyright (c) 2018-2023, Arm Limited.
* Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -108,7 +108,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
sbits -= 1009ull << 52;
scale = asdouble (sbits);
y = 0x1p1009 * (scale + scale * tmp);
return check_oflow (eval_as_double (y));
return y;
}
/* k < 0, need special care in the subnormal range. */
sbits += 1022ull << 52;
@ -128,7 +128,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
lo = scale - y + scale * tmp;
hi = one + y;
lo = one - hi + y + lo;
y = eval_as_double (hi + lo) - one;
y = (hi + lo) - one;
/* Fix the sign of 0. */
if (y == 0.0)
y = asdouble (sbits & 0x8000000000000000);
@ -137,7 +137,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
}
#endif
y = 0x1p-1022 * y;
return check_uflow (eval_as_double (y));
return y;
}
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
@ -192,7 +192,7 @@ exp_inline (double x, double xtail, uint32_t sign_bias)
double scale = asdouble (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
is no spurious underflow here even without fma. */
return eval_as_double (scale + scale * tmp);
return scale + scale * tmp;
}
/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
@ -239,7 +239,7 @@ exp_nosignbias (double x, double xtail)
double scale = asdouble (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
is no spurious underflow here even without fma. */
return eval_as_double (scale + scale * tmp);
return scale + scale * tmp;
}
/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
@ -267,7 +267,7 @@ zeroinfnan (uint64_t i)
}
static double NOINLINE
__pl_finite_pow (double x, double y)
pow_scalar_special_case (double x, double y)
{
uint32_t sign_bias = 0;
uint64_t ix, iy;
@ -311,9 +311,7 @@ __pl_finite_pow (double x, double y)
if (2 * ix == 0 && iy >> 63)
return __math_divzero (sign_bias);
#endif
/* Without the barrier some versions of clang hoist the 1/x2 and
thus a division-by-zero exception can be signaled spuriously. */
return iy >> 63 ? opt_barrier_double (1 / x2) : x2;
return iy >> 63 ? 1 / x2 : x2;
}
/* Here x and y are non-zero finite. */
if (ix >> 63)
@ -349,9 +347,7 @@ __pl_finite_pow (double x, double y)
if (topx == 0)
{
/* Normalize subnormal x so exponent becomes negative. */
/* Without the barrier some versions of clang evaluate the mul
unconditionally, causing spurious overflow exceptions. */
ix = asuint64 (opt_barrier_double (x) * 0x1p52);
ix = asuint64 (x * 0x1p52);
ix &= 0x7fffffffffffffff;
ix -= 52ULL << 52;
}
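The removed helpers deserve a note: eval_as_double is a no-op on AArch64 (FLT_EVAL_METHOD == 0), which is why the scalar fallback can drop it, and opt_barrier_double exists only to stop the compiler hoisting or folding a possibly-trapping operation. A typical definition looks like the sketch below (the library's own may differ; the "+w" constraint is AArch64-specific):
static inline double
opt_barrier_double_sketch (double x)
{
  /* Empty asm that claims to modify x: the value must materialise in a
     register here, so e.g. 1/x2 cannot be hoisted above its guard.  */
  __asm__ volatile ("" : "+w" (x));
  return x;
}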

View file

@ -1,13 +1,13 @@
/*
* Double-precision vector hypot(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#if WANT_SIMD_EXCEPT
static const struct data
@ -15,7 +15,7 @@ static const struct data
uint64x2_t tiny_bound, thres;
} data = {
.tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
.thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
.thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
};
#else
static const struct data
@ -24,7 +24,7 @@ static const struct data
uint32x4_t thres;
} data = {
.tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
.thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
.thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
};
#endif
@ -75,9 +75,9 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
uint32x2_t special = vcge_u32 (
vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
vget_low_u32 (d->thres));
uint32x2_t special
= vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
vget_low_u32 (d->thres));
if (unlikely (v_any_u32h (special)))
return special_case (x, y, sqsum, special);
@ -86,10 +86,10 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
}
#endif
PL_SIG (V, D, 2, hypot, -10.0, 10.0)
PL_TEST_ULP (V_NAME_D2 (hypot), 1.21)
PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
TEST_SIG (V, D, 2, hypot, -10.0, 10.0)
TEST_ULP (V_NAME_D2 (hypot), 1.21)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
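The reflowed compare is worth unpacking: vsubhn_u64 subtracts and keeps only the top 32 bits of each 64-bit lane, narrowing to a 64-bit mask, so the special-case test costs one compare on half-width data. Sketch, assuming arm_neon.h:
#include <arm_neon.h>
/* Per lane: ((a - tiny) >> 32, narrowed to u32) >= thres_hi.  */
static inline uint32x2_t
top_halves_ge (uint64x2_t a, uint64x2_t tiny, uint32x2_t thres_hi)
{
  return vcge_u32 (vsubhn_u64 (a, tiny), thres_hi);
}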

View file

@ -1,13 +1,13 @@
/*
* Single-precision vector hypot(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#if WANT_SIMD_EXCEPT
static const struct data
@ -15,7 +15,7 @@ static const struct data
uint32x4_t tiny_bound, thres;
} data = {
.tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
.thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
.thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
};
#else
static const struct data
@ -24,7 +24,7 @@ static const struct data
uint16x8_t thres;
} data = {
.tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
.thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
.thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
};
#endif
@ -41,7 +41,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
want 0x1.6a41dp-13. */
#if WANT_SIMD_EXCEPT
float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
@ -68,15 +68,15 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
}
#else
float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
uint16x4_t special = vcge_u16 (
vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
vget_low_u16 (d->thres));
uint16x4_t special
= vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
vget_low_u16 (d->thres));
if (unlikely (v_any_u16h (special)))
return special_case (x, y, sqsum, special);
@ -85,10 +85,12 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
}
#endif
PL_SIG (V, F, 2, hypot, -10.0, 10.0)
PL_TEST_ULP (V_NAME_F2 (hypot), 1.21)
PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
HALF_WIDTH_ALIAS_F2 (hypot)
TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
TEST_ULP (V_NAME_F2 (hypot), 1.21)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)

View file

@ -0,0 +1,118 @@
/*
* Double-precision vector log(x) function.
*
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
uint64x2_t off, sign_exp_mask, offset_lower_bound;
uint32x4_t special_bound;
float64x2_t c0, c2;
double c1, c3, ln2, c4;
} data = {
/* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
.c0 = V2 (-0x1.ffffffffffff7p-2),
.c1 = 0x1.55555555170d4p-2,
.c2 = V2 (-0x1.0000000399c27p-2),
.c3 = 0x1.999b2e90e94cap-3,
.c4 = -0x1.554e550bd501ep-3,
.ln2 = 0x1.62e42fefa39efp-1,
.sign_exp_mask = V2 (0xfff0000000000000),
.off = V2 (0x3fe6900900000000),
/* Lower bound is 0x0010000000000000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the stored value is lower bound - offset (which wraps around). */
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
};
#define N (1 << V_LOG_TABLE_BITS)
#define IndexMask (N - 1)
struct entry
{
float64x2_t invc;
float64x2_t logc;
};
static inline struct entry
lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
e.logc = vuzp2q_f64 (e0, e1);
return e;
}
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
uint32x2_t special, const struct data *d)
{
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}
/* Double-precision vector log routine.
The maximum observed error is 2.17 ULP:
_ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
want 0x1.ffffff1cca045p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
uint64x2_t u = vreinterpretq_u64_f64 (x);
uint64x2_t u_off = vsubq_u64 (u, d->off);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
struct entry e = lookup (u_off);
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
vget_low_u32 (d->special_bound));
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
/* hi = r + log(c) + k*Ln2. */
float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
y = vfmaq_f64 (p, r2, y);
if (unlikely (v_any_u32h (special)))
return special_case (hi, u_off, y, r2, special, d);
return vfmaq_f64 (hi, y, r2);
}
TEST_SIG (V, D, 1, log, 0.01, 11.1)
TEST_ULP (V_NAME_D1 (log), 1.67)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000)
TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000)
TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000)
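The "keep u_off, recover x later" idiom above saves a register move; the arithmetic itself is clearer in scalar form. A sketch with the same off and mask constants (log_reduce is an illustrative name):
#include <stdint.h>
#include <string.h>
/* x = 2^k * z, z in [Off, 2*Off): subtracting off makes the top bits of
   u_off the signed exponent k; masking them back out of u leaves z.  */
static void
log_reduce (double x, int64_t *k, double *z)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  uint64_t u_off = u - 0x3fe6900900000000ULL;
  *k = (int64_t) u_off >> 52;	/* arithmetic shift sign-extends k.  */
  uint64_t iz = u - (u_off & 0xfff0000000000000ULL);
  memcpy (z, &iz, sizeof iz);
}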

View file

@ -0,0 +1,132 @@
/*
* Double-precision vector log10(x) function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
uint64x2_t off, sign_exp_mask, offset_lower_bound;
uint32x4_t special_bound;
double invln10, log10_2;
double c1, c3;
float64x2_t c0, c2, c4;
} data = {
/* Computed from log coefficients divided by log(10) then rounded to double
precision. */
.c0 = V2 (-0x1.bcb7b1526e506p-3),
.c1 = 0x1.287a7636be1d1p-3,
.c2 = V2 (-0x1.bcb7b158af938p-4),
.c3 = 0x1.63c78734e6d07p-4,
.c4 = V2 (-0x1.287461742fee4p-4),
.invln10 = 0x1.bcb7b1526e50ep-2,
.log10_2 = 0x1.34413509f79ffp-2,
.off = V2 (0x3fe6900900000000),
.sign_exp_mask = V2 (0xfff0000000000000),
/* Lower bound is 0x0010000000000000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the stored value is lower bound - offset (which wraps around). */
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
};
#define N (1 << V_LOG10_TABLE_BITS)
#define IndexMask (N - 1)
struct entry
{
float64x2_t invc;
float64x2_t log10c;
};
static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
uint64_t i0
= (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
uint64_t i1
= (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
e.log10c = vuzp2q_f64 (e0, e1);
return e;
}
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
uint32x2_t special, const struct data *d)
{
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}
/* Fast implementation of double-precision vector log10
is a slight modification of double-precision vector log.
Max error: < 2.5 ULP (nearest rounding).
Maximum measured at 2.46 ULP for x in [0.96, 0.97]
_ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
want 0x1.fff6be3cae4b9p-6. */
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
uint64x2_t u = vreinterpretq_u64_f64 (x);
uint64x2_t u_off = vsubq_u64 (u, d->off);
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
struct entry e = lookup (u_off);
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
vget_low_u32 (d->special_bound));
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
/* hi = r / log(10) + log10(c) + k*log10(2).
Constants in v_log10_data.c are computed (in extended precision) as
e.log10c := e.logc * invln10. */
float64x2_t cte = vld1q_f64 (&d->invln10);
float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
/* y = log10(1+r) + n * log10(2). */
hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
y = vfmaq_f64 (y, d->c4, r2);
y = vfmaq_f64 (p, y, r2);
if (unlikely (v_any_u32h (special)))
return special_case (hi, u_off, y, r2, special, d);
return vfmaq_f64 (hi, y, r2);
}
TEST_SIG (V, D, 1, log10, 0.01, 11.1)
TEST_ULP (V_NAME_D1 (log10), 1.97)
TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)

View file

@ -0,0 +1,106 @@
/*
* Single-precision vector log10 function.
*
* Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
uint32x4_t mantissa_mask;
float c1, c3, c5, c7;
} data = {
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
.c0 = V4 (-0x1.bcb79cp-3f),
.c1 = 0x1.2879c8p-3f,
.c2 = V4 (-0x1.bcd472p-4f),
.c3 = 0x1.6408f8p-4f,
.c4 = V4 (-0x1.246f8p-4f),
.c5 = 0x1.f0e514p-5f,
.c6 = V4 (-0x1.0fc92cp-4f),
.c7 = 0x1.f5f76ap-5f,
.ln2 = V4 (0x1.62e43p-1f),
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the stored value is 0x00800000 - offset (which wraps around). */
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
.special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff),
};
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
}
/* Fast implementation of AdvSIMD log10f,
using a similar approach to AdvSIMD logf, with the same offset (i.e., 2/3) and
an order 9 polynomial.
Maximum error: 3.305 ULP (nearest rounding).
_ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
want 0x1.ffe2f4p-4. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t c1357 = vld1q_f32 (&d->c1);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
uint32x4_t u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
vget_low_u16 (d->special_bound));
uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log10(1+r) + n * log10(2). */
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
float32x4_t poly = vfmaq_f32 (c01, r2, p27);
/* y = (n*ln2 + r) * inv_ln10 = n*log10(2) + r/ln(10). */
float32x4_t y = vfmaq_f32 (r, d->ln2, n);
y = vmulq_f32 (y, d->inv_ln10);
if (unlikely (v_any_u16h (special)))
return special_case (y, u_off, poly, r2, special, d);
return vfmaq_f32 (y, poly, r2);
}
HALF_WIDTH_ALIAS_F1 (log10)
TEST_SIG (V, F, 1, log10, 0.01, 11.1)
TEST_ULP (V_NAME_F1 (log10), 2.81)
TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)

View file

@ -0,0 +1,61 @@
/*
* Double-precision vector log(1+x) function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#define WANT_V_LOG1P_K0_SHORTCUT 0
#include "v_log1p_inline.h"
const static struct data
{
struct v_log1p_data d;
uint64x2_t inf, minus_one;
} data = { .d = V_LOG1P_CONSTANTS_TABLE,
.inf = V2 (0x7ff0000000000000),
.minus_one = V2 (0xbff0000000000000) };
#define BottomMask v_u64 (0xffffffff)
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
{
/* Side-step special lanes so fenv exceptions are not triggered
inadvertently. */
float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
}
/* Vector log1p approximation using polynomial on reduced interval. Routine is
a modification of the algorithm used in scalar log1p, with no shortcut for
k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
_ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
want 0x1.fd61d0727429fp+2. */
VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
uint64x2_t special_cases
= vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
if (unlikely (v_any_u64 (special_cases)))
return special_case (x, special_cases, d);
return log1p_inline (x, &d->d);
}
TEST_SIG (V, D, 1, log1p, -0.9, 10.0)
TEST_ULP (V_NAME_D1 (log1p), 1.95)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
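v_zerofy_f64 above replaces special lanes with +0 before the shared polynomial runs, so fenv exceptions can only come from the scalar fallback. One plausible shape of that helper (sketch; the library's definition may differ):
#include <arm_neon.h>
static inline float64x2_t
zerofy_sketch (float64x2_t x, uint64x2_t special)
{
  /* Lanes flagged all-ones in `special` become +0.  */
  return vbslq_f64 (special, vdupq_n_f64 (0), x);
}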

View file

@ -0,0 +1,92 @@
/*
* Single-precision vector log(1+x) function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_log1pf_inline.h"
#if WANT_SIMD_EXCEPT
const static struct data
{
uint32x4_t minus_one, thresh;
struct v_log1pf_data d;
} data = {
.d = V_LOG1PF_CONSTANTS_TABLE,
.thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
.minus_one = V4 (0xbf800000),
};
/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
# define TinyBound v_u32 (0x34000000)
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
{
/* Side-step special lanes so fenv exceptions are not triggered
inadvertently. */
float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
}
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
error is 1.69 ULP:
_ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
want 0x1.cfcbdcp-3. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
uint32x4_t special_cases
= vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
vcgeq_u32 (ix, d->minus_one));
if (unlikely (v_any_u32 (special_cases)))
return special_case (x, special_cases, d);
return log1pf_inline (x, &d->d);
}
#else
const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, uint32x4_t cmp)
{
return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
}
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
error is 1.63 ULP:
_ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
want 0x1.fdcb16p-3. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
{
uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
vcaleq_f32 (x, v_f32 (0x1p127f)));
if (unlikely (v_any_u32 (special_cases)))
return special_case (x, special_cases);
return log1pf_inline (x, ptr_barrier (&data));
}
#endif
HALF_WIDTH_ALIAS_F1 (log1p)
TEST_SIG (V, F, 1, log1p, -0.9, 10.0)
TEST_ULP (V_NAME_F1 (log1p), 1.20)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
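In the non-fenv path the special mask leans on VORN: vornq_u32 (a, b) is a | ~b, so inverting the |x| <= 0x1p127 compare sweeps NaNs into the mask for free. A standalone sketch of that mask, assuming arm_neon.h:
#include <arm_neon.h>
/* special = (x <= -1) | !(|x| <= 0x1p127): x <= -1, huge x, and NaN.  */
static inline uint32x4_t
log1pf_special_mask (float32x4_t x)
{
  return vornq_u32 (vcleq_f32 (x, vdupq_n_f32 (-1.0f)),
		    vcaleq_f32 (x, vdupq_n_f32 (0x1p127f)));
}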

View file

@ -0,0 +1,123 @@
/*
* Double-precision vector log2 function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
uint64x2_t off, sign_exp_mask, offset_lower_bound;
uint32x4_t special_bound;
float64x2_t c0, c2;
double c1, c3, invln2, c4;
} data = {
/* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
and N = 128, then scaled by log2(e) in extended precision and rounded back
to double precision. */
.c0 = V2 (-0x1.71547652b8300p-1),
.c1 = 0x1.ec709dc340953p-2,
.c2 = V2 (-0x1.71547651c8f35p-2),
.c3 = 0x1.2777ebe12dda5p-2,
.c4 = -0x1.ec738d616fe26p-3,
.invln2 = 0x1.71547652b82fep0,
.off = V2 (0x3fe6900900000000),
.sign_exp_mask = V2 (0xfff0000000000000),
/* Lower bound is 0x0010000000000000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the stored value is lower bound - offset (which wraps around). */
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
};
#define N (1 << V_LOG2_TABLE_BITS)
#define IndexMask (N - 1)
struct entry
{
float64x2_t invc;
float64x2_t log2c;
};
static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
uint64_t i0
= (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
uint64_t i1
= (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
e.log2c = vuzp2q_f64 (e0, e1);
return e;
}
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
uint32x2_t special, const struct data *d)
{
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}
/* Double-precision vector log2 routine. Implements the same algorithm as
vector log10, with coefficients and table entries scaled in extended
precision. The maximum observed error is 2.58 ULP:
_ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
want 0x1.fffb34198d9ddp-5. */
float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
uint64x2_t u = vreinterpretq_u64_f64 (x);
uint64x2_t u_off = vsubq_u64 (u, d->off);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
struct entry e = lookup (u_off);
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
vget_low_u32 (d->special_bound));
/* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
float64x2_t hi
= vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
y = vfmaq_f64 (p, r2, y);
if (unlikely (v_any_u32h (special)))
return special_case (hi, u_off, y, r2, special, d);
return vfmaq_f64 (hi, y, r2);
}
TEST_SIG (V, D, 1, log2, 0.01, 11.1)
TEST_ULP (V_NAME_D1 (log2), 2.09)
TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)

View file

@ -0,0 +1,102 @@
/*
* Single-precision vector log2 function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float32x4_t c0, c2, c4, c6, c8;
uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
uint32x4_t mantissa_mask;
float c1, c3, c5, c7;
} data = {
/* Coefficients generated using Remez algorithm approximate
log2(1+r)/r for r in [ -1/3, 1/3 ].
rel error: 0x1.c4c4b0cp-26. */
.c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
.c1 = -0x1.715458p-1f,
.c2 = V4 (0x1.ec701cp-2f),
.c3 = -0x1.7171a4p-2f,
.c4 = V4 (0x1.27a0b8p-2f),
.c5 = -0x1.e5143ep-3f,
.c6 = V4 (0x1.9d8ecap-3f),
.c7 = -0x1.c675bp-3f,
.c8 = V4 (0x1.9e495p-3f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the stored value is 0x00800000 - offset (which wraps around). */
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
.special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff),
};
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
}
/* Fast implementation for single-precision AdvSIMD log2,
relying on the same argument reduction as AdvSIMD logf.
Maximum error: 2.48 ULPs
_ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
want 0x1.a9be8p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
uint32x4_t u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
vget_low_u16 (d->special_bound));
uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log2(1+r) + n. */
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t c1357 = vld1q_f32 (&d->c1);
float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
float32x4_t p = vfmaq_f32 (c01, r2, p28);
if (unlikely (v_any_u16h (special)))
return special_case (n, u_off, p, r, special, d);
return vfmaq_f32 (n, p, r);
}
HALF_WIDTH_ALIAS_F1 (log2)
TEST_SIG (V, F, 1, log2, 0.01, 11.1)
TEST_ULP (V_NAME_F1 (log2), 1.99)
TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)

View file

@ -0,0 +1,88 @@
/*
* Single-precision vector log function.
*
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
float32x4_t c2, c4, c6, ln2;
uint32x4_t off, offset_lower_bound, mantissa_mask;
uint16x8_t special_bound;
float c1, c3, c5, c0;
} data = {
/* 3.34 ulp error. */
.c0 = -0x1.3e737cp-3f,
.c1 = 0x1.5a9aa2p-3f,
.c2 = V4 (-0x1.4f9934p-3f),
.c3 = 0x1.961348p-3f,
.c4 = V4 (-0x1.00187cp-2f),
.c5 = 0x1.555d7cp-2f,
.c6 = V4 (-0x1.ffffc8p-2f),
.ln2 = V4 (0x1.62e43p-1f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use, subnormals are detected after the offset has been
subtracted, so the stored value is 0x00800000 - offset (which wraps around). */
.offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
.special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff)
};
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t c1350 = vld1q_f32 (&d->c1);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
float32x4_t n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
vget_low_u16 (d->special_bound));
uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log(1+r) + n*ln2. */
float32x4_t r2 = vmulq_f32 (r, r);
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
p = vfmaq_laneq_f32 (p, r2, c1350, 3);
q = vfmaq_f32 (q, p, r2);
y = vfmaq_f32 (y, q, r2);
p = vfmaq_f32 (r, d->ln2, n);
if (unlikely (v_any_u16h (cmp)))
return special_case (p, u_off, y, r2, cmp, d);
return vfmaq_f32 (p, y, r2);
}
HALF_WIDTH_ALIAS_F1 (log)
TEST_SIG (V, F, 1, log, 0.01, 11.1)
TEST_ULP (V_NAME_F1 (log), 2.9)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT)
TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000)
TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000)
TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000)
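The single-precision reduction mirrors the double one with a 2/3 offset so that 1+r straddles 1. A scalar sketch with the same constants (logf_reduce is an illustrative name):
#include <stdint.h>
#include <string.h>
/* x = 2^n * (1 + r), 1 + r in [2/3, 4/3): log(x) = n*ln2 + log1p(r).  */
static void
logf_reduce (float x, float *n, float *r)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  uint32_t u_off = u - 0x3f2aaaab;	/* off ~= asuint (2/3).  */
  *n = (float) ((int32_t) u_off >> 23);	/* sign-extended exponent.  */
  uint32_t iu = (u_off & 0x007fffff) + 0x3f2aaaab;
  float z;
  memcpy (&z, &iu, sizeof z);
  *r = z - 1.0f;
}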

View file

@ -0,0 +1,33 @@
/*
* Double-precision vector modf(x, *y) function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
/* Modf algorithm. Produces exact values in all rounding modes. */
float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int)
{
/* Get integer component of x. */
float64x2_t rounded = vrndq_f64 (x);
vst1q_f64 (out_int, rounded);
/* Subtract integer component from input. */
uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded));
/* Return +0 for integer x. */
uint64x2_t is_integer = vceqq_f64 (x, rounded);
return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer));
}
TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0)
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000)
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000)
TEST_ULP (_ZGVnN2vl8_modf_int, 0.0)
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000)
TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000)
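A hypothetical call-site sketch for the routine above; the mangled name and prototype are inferred from the test entries (not quoted from a library header), and out_int receives one double per lane:
#include <arm_neon.h>
float64x2_t _ZGVnN2vl8_modf (float64x2_t x, double *out_int); /* assumed.  */
static void
modf_pair (const double in[2], double frac[2], double whole[2])
{
  float64x2_t f = _ZGVnN2vl8_modf (vld1q_f64 (in), whole);
  vst1q_f64 (frac, f);
}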

View file

@ -0,0 +1,34 @@
/*
* Single-precision vector modf(x, *y) function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
/* Modff algorithm. Produces exact values in all rounding modes. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x,
float *out_int)
{
/* Get integer component of x. */
float32x4_t rounded = vrndq_f32 (x);
vst1q_f32 (out_int, rounded);
/* Subtract integer component from input. */
uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded));
/* Return +0 for integer x. */
uint32x4_t is_integer = vceqq_f32 (x, rounded);
return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer));
}
TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0)
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000)
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000)
TEST_ULP (_ZGVnN4vl4_modff_int, 0.0)
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000)
TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000)

View file

@ -1,20 +1,17 @@
/*
* Double-precision vector pow function.
*
* Copyright (c) 2020-2023, Arm Limited.
* Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
/* Defines parameters of the approximation and scalar fallback. */
#include "finite_pow.h"
#define VecSmallExp v_u64 (SmallExp)
#define VecThresExp v_u64 (ThresExp)
#define VecSmallPowX v_u64 (SmallPowX)
#define VecThresPowX v_u64 (ThresPowX)
#define VecSmallPowY v_u64 (SmallPowY)
@ -22,34 +19,49 @@
static const struct data
{
float64x2_t log_poly[7];
float64x2_t exp_poly[3];
float64x2_t ln2_hi, ln2_lo;
float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n;
uint64x2_t inf;
float64x2_t small_powx;
uint64x2_t offset, mask;
uint64x2_t mask_sub_0, mask_sub_1;
float64x2_t log_c0, log_c2, log_c4, log_c5;
double log_c1, log_c3;
double ln2_lo, ln2_hi;
uint64x2_t small_exp, thres_exp;
double ln2_lo_n, ln2_hi_n;
double inv_ln2_n, exp_c2;
float64x2_t exp_c0, exp_c1;
} data = {
/* Power threshold. */
.inf = V2 (0x7ff0000000000000),
.small_powx = V2 (0x1p-126),
.offset = V2 (Off),
.mask = V2 (0xfffULL << 52),
.mask_sub_0 = V2 (1ULL << 52),
.mask_sub_1 = V2 (52ULL << 52),
/* Coefficients copied from v_pow_log_data.c
relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
Coefficients are scaled to match the scaling during evaluation. */
.log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2),
V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4),
V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8),
V2 (-0x1.0002b8b263fc3p-3 * -8) },
.ln2_hi = V2 (0x1.62e42fefa3800p-1),
.ln2_lo = V2 (0x1.ef35793c76730p-45),
.log_c0 = V2 (0x1.555555555556p-2 * -2),
.log_c1 = -0x1.0000000000006p-2 * -2,
.log_c2 = V2 (0x1.999999959554ep-3 * 4),
.log_c3 = -0x1.555555529a47ap-3 * 4,
.log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
.log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
.ln2_hi = 0x1.62e42fefa3800p-1,
.ln2_lo = 0x1.ef35793c76730p-45,
/* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
(0.550 without fma) if |x| < ln2/512. */
.exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
V2 (0x1.5555576a5adcep-5) },
.shift = V2 (0x1.8p52), /* round to nearest int without intrinsics. */
.inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
.ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
.ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
.exp_c0 = V2 (0x1.fffffffffffd4p-2),
.exp_c1 = V2 (0x1.5555571d6ef9p-3),
.exp_c2 = 0x1.5555576a5adcep-5,
.small_exp = V2 (0x3c90000000000000),
.thres_exp = V2 (0x03f0000000000000),
.inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
.ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
.ln2_lo_n = -0x1.c610ca86c3899p-45,
};
#define A(i) data.log_poly[i]
#define C(i) data.exp_poly[i]
/* This version implements an algorithm close to AOR scalar pow but
/* This version implements an algorithm close to scalar pow but
- does not implement the trick in the exp's special-case subroutine to avoid
double-rounding,
- does not use a tail in the exponential core computation,
@ -78,10 +90,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
int64x2_t k
= vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
uint64x2_t tmp = vsubq_u64 (ix, d->offset);
int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
float64x2_t kd = vcvtq_f64_s64 (k);
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
@ -92,12 +103,13 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
/* k*Ln2 + log(c) + r. */
float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
float64x2_t t2 = vaddq_f64 (t1, r);
float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
float64x2_t ar = vmulq_f64 (A (0), r);
float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
float64x2_t ar2 = vmulq_f64 (r, ar);
float64x2_t ar3 = vmulq_f64 (r, ar2);
/* k*Ln2 + log(c) + r + A[0]*r*r. */
@ -105,9 +117,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
float64x2_t a56 = vfmaq_f64 (A (5), r, A (6));
float64x2_t a34 = vfmaq_f64 (A (3), r, A (4));
float64x2_t a12 = vfmaq_f64 (A (1), r, A (2));
float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
float64x2_t p = vfmaq_f64 (a34, ar2, a56);
p = vfmaq_f64 (a12, ar2, p);
p = vmulq_f64 (ar3, p);
@ -118,29 +131,37 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
return y;
}
static float64x2_t VPCS_ATTR NOINLINE
exp_special_case (float64x2_t x, float64x2_t xtail)
{
return (float64x2_t){ exp_nosignbias (x[0], xtail[0]),
exp_nosignbias (x[1], xtail[1]) };
}
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
static inline float64x2_t
v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
{
/* Fallback to scalar exp_inline for all lanes if any lane
contains value of x s.t. |x| <= 2^-54 or >= 512. */
uint64x2_t abstop
= vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff));
uint64x2_t uoflowx
= vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
uint64x2_t uoflowx = vcgeq_u64 (
vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
d->thres_exp);
if (unlikely (v_any_u64 (uoflowx)))
return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1));
return exp_special_case (x, vnegq_f64 (neg_xtail));
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
float64x2_t kd = vaddq_f64 (z, d->shift);
uint64x2_t ki = vreinterpretq_u64_f64 (kd);
kd = vsubq_f64 (kd, d->shift);
float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
r = vfmsq_f64 (r, kd, d->ln2_lo_n);
float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
float64x2_t kd = vrndnq_f64 (z);
uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
r = vaddq_f64 (r, xtail);
r = vsubq_f64 (r, neg_xtail);
/* 2^(k/N) ~= scale. */
uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
@ -149,8 +170,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
sbits = vaddq_u64 (sbits, top);
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
tmp = vfmaq_f64 (C (0), r, tmp);
float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
tmp = vfmaq_f64 (d->exp_c0, r, tmp);
tmp = vfmaq_f64 (r, r2, tmp);
float64x2_t scale = vreinterpretq_f64_u64 (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
@ -158,54 +179,59 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
return vfmaq_f64 (scale, scale, tmp);
}
static float64x2_t NOINLINE VPCS_ATTR
scalar_fallback (float64x2_t x, float64x2_t y)
{
return (float64x2_t){ pow_scalar_special_case (x[0], y[0]),
pow_scalar_special_case (x[1], y[1]) };
}
float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
{
const struct data *d = ptr_barrier (&data);
/* Case of x <= 0 is too complicated to be vectorised efficiently here,
fallback to scalar pow for all lanes if any x < 0 detected. */
if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
return scalar_fallback (x, y);
uint64x2_t vix = vreinterpretq_u64_f64 (x);
uint64x2_t viy = vreinterpretq_u64_f64 (y);
uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
uint64x2_t vtopy = vshrq_n_u64 (viy, 52);
uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff));
uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff));
uint64x2_t iay = vandq_u64 (viy, d->inf);
/* Special cases of x or y. */
#if WANT_SIMD_EXCEPT
/* Small or large. */
uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
uint64x2_t specialx
= vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
uint64x2_t specialy
= vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
#else
/* Inf or nan. */
uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff));
uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff));
/* The case y==0 does not trigger a special case, since in this case it is
necessary to fix the result only if x is a signalling nan, which already
triggers a special case. We test y==0 directly in the scalar fallback. */
uint64x2_t iax = vandq_u64 (vix, d->inf);
uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
#endif
uint64x2_t special = vorrq_u64 (specialx, specialy);
/* Fallback to scalar on all lanes if any lane is inf or nan. */
if (unlikely (v_any_u64 (special)))
return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
return scalar_fallback (x, y);
/* Small cases of x: |x| < 0x1p-126. */
uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX);
uint64x2_t smallx = vcaltq_f64 (x, d->small_powx);
if (unlikely (v_any_u64 (smallx)))
{
/* Update ix if top 12 bits of x are 0. */
uint64x2_t sub_x = vceqzq_u64 (vtopx);
uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52));
if (unlikely (v_any_u64 (sub_x)))
{
/* Normalize subnormal x so exponent becomes negative. */
uint64x2_t vix_norm
= vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52)));
vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff));
vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
uint64x2_t vix_norm = vreinterpretq_u64_f64 (
vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
vix = vbslq_u64 (sub_x, vix_norm, vix);
}
}
@ -216,21 +242,20 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
/* Vector Exp(y_loghi, y_loglo). */
float64x2_t vehi = vmulq_f64 (y, vhi);
float64x2_t velo = vmulq_f64 (y, vlo);
float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
velo = vsubq_f64 (velo, vemi);
return v_exp_inline (vehi, velo, d);
float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
return v_exp_inline (vehi, neg_velo, d);
}
PL_SIG (V, D, 2, pow)
PL_TEST_ULP (V_NAME_D2 (pow), 0.55)
PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
TEST_SIG (V, D, 2, pow)
TEST_ULP (V_NAME_D2 (pow), 0.55)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
/* Wide intervals spanning the whole domain but shared between x and y. */
#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
@ -248,12 +273,12 @@ V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer. */
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* 1.0^y. */
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
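The hunk around vemi/neg_velo is an extended-precision product: log(x) is
produced as a hi/lo pair, and y*(hi+lo) is split with an FMA so that exp
sees the product to well beyond double precision; the new code keeps the
tail negated to save a negation inside v_exp_inline. A scalar sketch of the
underlying compensated multiply (names are illustrative, sign convention
unflipped):

#include <math.h>

/* Returns ehi = y*hi rounded, and *lo_out such that
   y*(hi + lo) ~= ehi + *lo_out, using an FMA for the exact error term. */
static double
ylogx_split (double y, double hi, double lo, double *lo_out)
{
  double ehi = y * hi;
  double err = fma (y, hi, -ehi);   /* exact rounding error of y*hi. */
  *lo_out = fma (y, lo, err);       /* tail passed on to exp. */
  return ehi;
}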

View file

@ -0,0 +1,209 @@
/*
* Single-precision vector powf function.
*
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
#define Min v_u32 (0x00800000)
#define Max v_u32 (0x7f800000)
#define Thresh v_u32 (0x7f000000) /* Max - Min. */
#define MantissaMask v_u32 (0x007fffff)
#define A d->log2_poly
#define C d->exp2f_poly
/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
#define Off v_u32 (0x3f35d000)
#define V_POWF_LOG2_TABLE_BITS 5
#define V_EXP2F_TABLE_BITS 5
#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
static const struct data
{
struct
{
double invc, logc;
} log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
float64x2_t log2_poly[4];
uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
float64x2_t exp2f_poly[3];
} data = {
.log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
{0x1p+0, 0x0p+0 * Scale},
{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
.log2_poly = { /* rel err: 1.5 * 2^-30. */
V2 (-0x1.6ff5daa3b3d7cp-2 * Scale),
V2 (0x1.ec81d03c01aebp-2 * Scale),
V2 (-0x1.71547bb43f101p-1 * Scale),
V2 (0x1.7154764a815cbp0 * Scale)},
.exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
.exp2f_poly = { /* rel err: 1.69 * 2^-34. */
V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale),
V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale),
V2 (0x1.62e42ff0c52d6p-1 / Scale)}};
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
{
return v_call2_f32 (powf, x, y, ret, cmp);
}
static inline float64x2_t
ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k,
float64x2_t invc, float64x2_t logc, float64x2_t y)
{
/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc);
float64x2_t y0 = vaddq_f64 (logc, k);
/* Polynomial to approximate log1p(r)/ln2. */
float64x2_t logx = vfmaq_f64 (A[1], r, A[0]);
logx = vfmaq_f64 (A[2], logx, r);
logx = vfmaq_f64 (A[3], logx, r);
logx = vfmaq_f64 (y0, logx, r);
return vmulq_f64 (logx, y);
}
static inline float64x2_t
log2_lookup (const struct data *d, uint32_t i)
{
return vld1q_f64 (
&d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc);
}
static inline uint64x1_t
exp2f_lookup (const struct data *d, uint64_t i)
{
return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]);
}
static inline float32x2_t
powf_core (const struct data *d, float64x2_t ylogx)
{
/* N*x = k + r with r in [-1/2, 1/2]. */
float64x2_t kd = vrndnq_f64 (ylogx);
int64x2_t ki = vcvtaq_s64_f64 (ylogx);
float64x2_t r = vsubq_f64 (ylogx, kd);
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)),
exp2f_lookup (d, vgetq_lane_s64 (ki, 1)));
t = vaddq_u64 (
t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS)));
float64x2_t s = vreinterpretq_f64_u64 (t);
float64x2_t p = vfmaq_f64 (C[1], r, C[0]);
p = vfmaq_f64 (C[2], r, p);
p = vfmaq_f64 (s, p, vmulq_f64 (s, r));
return vcvt_f32_f64 (p);
}
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t u = vreinterpretq_u32_f32 (x);
uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
uint32x4_t tmp = vsubq_u32 (u, Off);
uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top));
int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
/* Use double precision for each lane: split input vectors into lo and hi
halves and promote. */
float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)),
tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)),
tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)),
tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3));
float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)),
iz_hi = vcvt_high_f64_f32 (iz);
float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))),
k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k));
float64x2_t invc_lo = vzip1q_f64 (tab0, tab1),
invc_hi = vzip1q_f64 (tab2, tab3),
logc_lo = vzip2q_f64 (tab0, tab1),
logc_hi = vzip2q_f64 (tab2, tab3);
float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)),
y_hi = vcvt_high_f64_f32 (y);
float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo);
float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi);
uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo),
vreinterpretq_u32_f64 (ylogx_hi));
cmp = vorrq_u32 (
cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)),
vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS))
>> 47)));
float32x2_t p_lo = powf_core (d, ylogx_lo);
float32x2_t p_hi = powf_core (d, ylogx_hi);
if (unlikely (v_any_u32 (cmp)))
return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp);
return vcombine_f32 (p_lo, p_hi);
}
HALF_WIDTH_ALIAS_F2 (pow)
TEST_SIG (V, F, 2, pow)
TEST_ULP (V_NAME_F2 (pow), 2.1)
TEST_DISABLE_FENV (V_NAME_F2 (pow))
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
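The overall structure is x^y = 2^(y*log2(x)) evaluated per lane in double
precision, with 32-entry tables for both the log2 and exp2 kernels; one
double-precision pass is enough to stay within the 2.6 ulp budget quoted in
the comment above. A scalar skeleton of that scheme, with libm log2/exp2
standing in for the table-based kernels and special cases ignored:

#include <math.h>

static float
powf_skeleton (float x, float y)   /* assumes x > 0 and no overflow. */
{
  double ylogx = (double) y * log2 ((double) x);
  return (float) exp2 (ylogx);     /* x^y = 2^(y*log2(x)). */
}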

View file

@ -1,17 +1,19 @@
/*
* Double-precision vector sin function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "test_defs.h"
#include "test_sig.h"
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float64x2_t poly[7];
float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
@ -23,12 +25,13 @@ static const struct data
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
.shift = V2 (0x1.8p52),
};
#if WANT_SIMD_EXCEPT
# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
/* asuint64 (0x1p-253), below which multiplying by inv_pi underflows. */
# define TinyBound v_u64 (0x3020000000000000)
/* RangeVal - TinyBound. */
# define Thresh v_u64 (0x1160000000000000)
#endif
#define C(i) d->poly[i]
@ -61,16 +64,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
fenv). These lanes will be fixed by the special-case handler later. */
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
#else
r = x;
cmp = vcageq_f64 (x, d->range_val);
#endif
/* n = rint(|x|/pi). */
n = vfmaq_f64 (d->shift, d->inv_pi, r);
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
n = vsubq_f64 (n, d->shift);
n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
@ -95,3 +97,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
TEST_SIG (V, D, 1, sin, -3.1, 3.1)
TEST_ULP (V_NAME_D1 (sin), 3.0)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000)
TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000)
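The rewritten reduction drops the shift trick in favour of vrndaq/vcvtq:
n = rint(x/pi), with pi split into three parts so the FMA subtractions
cancel exactly, and the parity of n flips the sign of sin(r). A scalar
sketch under the same scheme (libm sin stands in for the polynomial, and
inputs are assumed inside the routine's range):

#include <math.h>

static double
sin_sketch (double x)
{
  double n = round (x * 0x1.45f306dc9c883p-2);   /* rint(x/pi). */
  double r = x;
  r = fma (-n, 0x1.921fb54442d18p+1, r);   /* subtract n*pi, high part. */
  r = fma (-n, 0x1.1a62633145c06p-53, r);  /* ... middle part. */
  r = fma (-n, 0x1.c1cd129024e09p-106, r); /* ... low part: r in [-pi/2, pi/2]. */
  double y = sin (r);                      /* polynomial in the routine. */
  return (fmod (n, 2.0) != 0) ? -y : y;    /* odd n flips the sign. */
}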

View file

@ -1,7 +1,7 @@
/*
* Double-precision vector sincos function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -10,12 +10,21 @@
be linked against the scalar sincosf from math/. */
#define _GNU_SOURCE
#include <math.h>
#undef _GNU_SOURCE
#include "v_math.h"
#include "pl_test.h"
#include "test_defs.h"
#include "v_sincos_common.h"
/* sincos not available for all scalar libm implementations. */
#if defined(_MSC_VER) || !defined(__GLIBC__)
static void
sincos (double x, double *out_sin, double *out_cos)
{
*out_sin = sin (x);
*out_cos = cos (x);
}
#endif
static void VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, double *out_sin,
double *out_cos)
@ -46,12 +55,13 @@ _ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
special_case (x, special, out_sin, out_cos);
}
PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos)
TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin)
TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
#define V_SINCOS_INTERVAL(lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
V_SINCOS_INTERVAL (0, 0x1p23, 500000)
V_SINCOS_INTERVAL (-0, -0x1p23, 500000)
TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
V_SINCOS_INTERVAL (0, 0x1p-31, 50000)
V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000)
V_SINCOS_INTERVAL (0x1p23, inf, 10000)
V_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
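With the shim in place the file builds against any scalar libm; callers
reach the vector variant through its vector-ABI name. A usage sketch,
assuming AArch64 with this library linked in (in real code the declaration
would come from the library's headers rather than being written by hand):

#include <arm_neon.h>

__attribute__ ((aarch64_vector_pcs)) void
_ZGVnN2vl8l8_sincos (float64x2_t, double *, double *);

static void
sincos2 (const double in[2], double s[2], double c[2])
{
  float64x2_t x = vld1q_f64 (in);
  _ZGVnN2vl8l8_sincos (x, s, c);  /* s[i] = sin(in[i]), c[i] = cos(in[i]). */
}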

View file

@ -1,7 +1,7 @@
/*
* Single-precision vector sincos function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -10,11 +10,20 @@
be linked against the scalar sincosf from math/. */
#define _GNU_SOURCE
#include <math.h>
#undef _GNU_SOURCE
#include "v_sincosf_common.h"
#include "v_math.h"
#include "pl_test.h"
#include "test_defs.h"
/* sincos not available for all scalar libm implementations. */
#if defined(_MSC_VER) || !defined(__GLIBC__)
static void
sincosf (float x, float *out_sin, float *out_cos)
{
*out_sin = sinf (x);
*out_cos = cosf (x);
}
#endif
static void VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, float *out_sin,
@ -47,12 +56,13 @@ _ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
special_case (x, special, out_sin, out_cos);
}
PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin)
TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos)
TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
#define V_SINCOSF_INTERVAL(lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
V_SINCOSF_INTERVAL (0, 0x1p20, 500000)
V_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
V_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)

View file

@ -0,0 +1,44 @@
/*
* Double-precision vector sincospi function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_sincospi_common.h"
#include "v_math.h"
#include "test_defs.h"
/* Double-precision vector function allowing calculation of both sin and cos in
one function call, using separate argument reduction and shared low-order
polynomials.
Approximation for vector double-precision sincospi(x).
Maximum Error 3.09 ULP:
_ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
want 0x1.fd54d0b327cf4p-1
Maximum Error 3.16 ULP:
_ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
want 0x1.fd2da484ff402p-1. */
VPCS_ATTR void
_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos)
{
const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
float64x2x2_t sc = v_sincospi_inline (x, d);
vst1q_f64 (out_sin, sc.val[0]);
vst1q_f64 (out_cos, sc.val[1]);
}
#if WANT_TRIGPI_TESTS
TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos)
TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin)
TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59)
TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66)
# define V_SINCOSPI_INTERVAL(lo, hi, n) \
TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \
TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n)
V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000)
V_SINCOSPI_INTERVAL (0x1p63, inf, 10000)
#endif
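sincospi avoids the classic sin(M_PI * x) pitfall: x itself is reduced
first, so no accuracy is lost to the rounding of pi*x for large arguments.
A scalar sketch of the identities behind the shared reduction (libm sin
stands in for the polynomial; x + 0.5 can itself round for huge x, and the
signed-zero conventions of the real routine are not reproduced, so this is
illustrative only):

#include <math.h>

static void
sincospi_sketch (double x, double *out_sin, double *out_cos)
{
  /* sin(pi*x) = (-1)^n * sin(pi*r) with n = rint(x), r = x - n. */
  double n = nearbyint (x);
  double r = x - n;                        /* |r| <= 1/2, exact. */
  double s = sin (M_PI * r);               /* polynomial in the routine. */
  *out_sin = fmod (n, 2.0) != 0 ? -s : s;
  /* cospi(x) = sinpi(x + 1/2): same reduction, shifted by a half. */
  double nc = nearbyint (x + 0.5);
  double rc = (x + 0.5) - nc;
  double c = sin (M_PI * rc);
  *out_cos = fmod (nc, 2.0) != 0 ? -c : c;
}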

View file

@ -0,0 +1,43 @@
/*
* Single-precision vector sincospi function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_sincospif_common.h"
#include "v_math.h"
#include "test_defs.h"
#include "mathlib.h"
/* Single-precision vector function allowing calculation of both sinpi and
cospi in one function call, using shared argument reduction and polynomials.
Worst-case error for sin is 3.04 ULP:
_ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
Worst-case error for cos is 3.18 ULP:
_ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
*/
VPCS_ATTR void
_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos)
{
const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
float32x4x2_t sc = v_sincospif_inline (x, d);
vst1q_f32 (out_sin, sc.val[0]);
vst1q_f32 (out_cos, sc.val[1]);
}
#if WANT_TRIGPI_TESTS
TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin)
TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos)
TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54)
TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68)
# define V_SINCOSPIF_INTERVAL(lo, hi, n) \
TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \
TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n)
V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000)
V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000)
V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
#endif

View file

@ -1,17 +1,19 @@
/*
* Single-precision vector sin function.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "test_defs.h"
#include "test_sig.h"
static const struct data
{
float32x4_t poly[4];
float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@ -22,13 +24,14 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
.shift = V4 (0x1.8p+23f),
.range_val = V4 (0x1p20f)
};
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
/* asuint32 (0x1p-59f), below which multiplying by inv_pi underflows. */
# define TinyBound v_u32 (0x22000000)
/* RangeVal - TinyBound. */
# define Thresh v_u32 (0x27800000)
#endif
#define C(i) d->poly[i]
@ -41,7 +44,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
return v_call_f32 (sinf, x, y, cmp);
}
float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, y;
@ -53,23 +56,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
the special-case handler later. */
r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
#else
r = x;
cmp = vcageq_f32 (x, d->range_val);
#endif
/* n = rint(|x|/pi) */
n = vfmaq_f32 (d->shift, d->inv_pi, r);
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
n = vsubq_f32 (n, d->shift);
/* n = rint(|x|/pi). */
n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f32 (r, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
/* y = sin(r) */
/* y = sin(r). */
r2 = vmulq_f32 (r, r);
y = vfmaq_f32 (C (2), C (3), r2);
y = vfmaq_f32 (C (1), y, r2);
@ -80,3 +82,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
HALF_WIDTH_ALIAS_F1 (sin)
TEST_SIG (V, F, 1, sin, -3.1, 3.1)
TEST_ULP (V_NAME_F1 (sin), 1.4)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000)
TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000)

View file

@ -0,0 +1,80 @@
/*
* Double-precision vector sinh(x) function.
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_expm1_inline.h"
static const struct data
{
struct v_expm1_data d;
uint64x2_t halff;
#if WANT_SIMD_EXCEPT
uint64x2_t tiny_bound, thresh;
#else
float64x2_t large_bound;
#endif
} data = {
.d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
#if WANT_SIMD_EXCEPT
/* 2^-26, below which sinh(x) rounds to x. */
.tiny_bound = V2 (0x3e50000000000000),
/* asuint(large_bound) - asuint(tiny_bound). */
.thresh = V2 (0x0230000000000000),
#else
/* 2^9, above which the expm1 helper overflows. */
.large_bound = V2 (0x1p+9),
#endif
};
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x)
{
return v_call_f64 (sinh, x, x, v_u64 (-1));
}
/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
The greatest observed error is 2.52 ULP:
_ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
want -0x1.ac2f05bb66fc9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
float64x2_t halfsign = vreinterpretq_f64_u64 (
vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
#if WANT_SIMD_EXCEPT
uint64x2_t special = vcgeq_u64 (
vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
#else
uint64x2_t special = vcageq_f64 (x, d->large_bound);
#endif
/* Fall back to scalar variant for all lanes if any of them are special. */
if (unlikely (v_any_u64 (special)))
return special_case (x);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
float64x2_t t = expm1_inline (ax, &d->d);
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
return vmulq_f64 (t, halfsign);
}
TEST_SIG (V, D, 1, sinh, -10.0, 10.0)
TEST_ULP (V_NAME_D1 (sinh), 2.02)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
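The rearrangement mentioned in the comment is: with t = expm1(|x|),
sinh(|x|) = (t + t/(t+1))/2, and the halving and the sign restoration are
fused into a single multiply by halfsign = +-0.5. In scalar form:

#include <math.h>

static double
sinh_sketch (double x)   /* assumes |x| < 2^9 so expm1 cannot overflow. */
{
  double halfsign = copysign (0.5, x);
  double t = expm1 (fabs (x));
  return (t + t / (t + 1.0)) * halfsign;  /* (e^|x| - e^-|x|)/2, signed. */
}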

View file

@ -1,28 +1,25 @@
/*
* Single-precision vector sinh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_expm1f_inline.h"
static const struct data
{
struct v_expm1f_data expm1f_consts;
uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
uint32x4_t oflow_bound;
float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
.halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@ -30,14 +27,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
.oflow_bound = V4 (0x42b0c0a7),
.oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
uint32x4_t special)
{
return v_call_f32 (sinhf, x, y, special);
return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@ -45,21 +43,21 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The maximum error is 2.26 ULP:
_ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
want 0x1.e469e4p-4. */
float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
float32x4_t halfsign = vreinterpretq_f32_u32 (
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
uint32x4_t special = vcgeq_u32 (
vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@ -71,14 +69,16 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (unlikely (v_any_u32 (special)))
return special_case (x, vmulq_f32 (t, halfsign), special);
return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}
PL_SIG (V, F, 1, sinh, -10.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (sinh), 1.76)
PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
HALF_WIDTH_ALIAS_F1 (sinh)
TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
TEST_ULP (V_NAME_F1 (sinh), 1.76)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)

View file

@ -1,15 +1,15 @@
/*
* Double-precision vector sinpi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f64.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@ -34,7 +34,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
return v_call_f64 (sinpi, x, y, cmp);
return v_call_f64 (arm_math_sinpi, x, y, cmp);
}
#endif
@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
#if WANT_TRIGPI_TESTS
TEST_ULP (V_NAME_D1 (sinpi), 2.56)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
#endif

View file

@ -1,15 +1,15 @@
/*
* Single-precision vector sinpi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f32.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
@ -29,7 +29,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
return v_call_f32 (sinpif, x, y, cmp);
return v_call_f32 (arm_math_sinpif, x, y, cmp);
}
#endif
@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
Maximum Error 3.03 ULP:
_ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
want 0x1.f7cd5p-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -72,10 +72,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
PL_SIG (V, F, 1, sinpi, -0.9, 0.9)
PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54)
PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
HALF_WIDTH_ALIAS_F1 (sinpi)
#if WANT_TRIGPI_TESTS
TEST_ULP (V_NAME_F1 (sinpi), 2.54)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
#endif

View file

@ -1,19 +1,20 @@
/*
* Double-precision vector tan(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f64.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float64x2_t poly[9];
float64x2_t half_pi, two_over_pi, shift;
double half_pi[2];
float64x2_t two_over_pi, shift;
#if !WANT_SIMD_EXCEPT
float64x2_t range_val;
#endif
@ -71,8 +72,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
float64x2_t r = x;
r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
float64x2_t half_pi = vld1q_f64 (dat->half_pi);
r = vfmsq_laneq_f64 (r, q, half_pi, 0);
r = vfmsq_laneq_f64 (r, q, half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
r = vmulq_n_f64 (r, 0.5);
@ -112,9 +114,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
vbslq_f64 (no_recip, d, n));
}
PL_SIG (V, D, 1, tan, -3.1, 3.1)
PL_TEST_ULP (V_NAME_D1 (tan), 2.99)
PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
TEST_SIG (V, D, 1, tan, -3.1, 3.1)
TEST_ULP (V_NAME_D1 (tan), 2.99)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
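The double-angle reconstruction above works as follows: after reducing to
r = (x - q*pi/2)/2 in [-pi/8, pi/8], tan(x) is rebuilt as 2t/(1 - t^2) for
even q, and as the negated reciprocal (the cotangent identity) for odd q.
A scalar sketch with a one-part pi/2 where the routine uses its two-part
half_pi, and libm tan standing in for the polynomial:

#include <math.h>

static double
tan_sketch (double x)   /* assumes |x| below the routine's RangeVal. */
{
  double q = round (x * (2.0 / M_PI));    /* rint(x / (pi/2)). */
  double r = 0.5 * fma (-q, M_PI_2, x);   /* half the reduced argument,
                                             in [-pi/8, pi/8]. */
  double t = tan (r);                     /* polynomial in the routine. */
  double n = 2.0 * t, d = 1.0 - t * t;    /* tan(2r) = 2t / (1 - t^2). */
  return fmod (q, 2.0) != 0 ? -d / n : n / d;
}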

View file

@ -1,19 +1,19 @@
/*
* Single-precision vector tan(x) function.
*
* Copyright (c) 2021-2023, Arm Limited.
* Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "v_poly_f32.h"
#include "test_sig.h"
#include "test_defs.h"
static const struct data
{
float32x4_t poly[6];
float32x4_t pi_consts;
float pi_consts[4];
float32x4_t shift;
#if !WANT_SIMD_EXCEPT
float32x4_t range_val;
@ -64,7 +64,7 @@ eval_poly (float32x4_t z, const struct data *d)
Maximum error is 3.45 ULP:
__v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
want 0x1.ff9850p-1. */
float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t special_arg = x;
@ -85,16 +85,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
#endif
/* n = rint(x/(pi/2)). */
float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
float32x4_t n = vsubq_f32 (q, d->shift);
/* Determine if x lives in an interval where |tan(x)| grows to infinity. */
uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
/* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
float32x4_t r;
r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
/* If x lives in an interval where |tan(x)|
- is finite, then use a polynomial approximation of the form
@ -119,9 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
return vbslq_f32 (pred_alt, inv_y, y);
}
PL_SIG (V, F, 1, tan, -3.1, 3.1)
PL_TEST_ULP (V_NAME_F1 (tan), 2.96)
PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
HALF_WIDTH_ALIAS_F1 (tan)
TEST_SIG (V, F, 1, tan, -3.1, 3.1)
TEST_ULP (V_NAME_F1 (tan), 2.96)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)

View file

@ -0,0 +1,67 @@
/*
* Double-precision vector tanh(x) function.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_expm1_inline.h"
static const struct data
{
struct v_expm1_data d;
uint64x2_t thresh, tiny_bound;
} data = {
.d = V_EXPM1_DATA,
.tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
/* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
.thresh = V2 (0x01f241bf835f9d5f),
};
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
uint64x2_t special)
{
return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
}
/* Vector approximation for double-precision tanh(x), using a simplified
version of expm1. The greatest observed error is 2.70 ULP:
_ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
want -0x1.be5452a6459fbp-3. */
float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
float64x2_t u = x;
/* Trigger special-case handling for tiny, boring and infinity/NaN inputs. */
uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
#if WANT_SIMD_EXCEPT
/* To trigger fp exceptions correctly, set special lanes to a neutral value.
They will be fixed up later by the special-case handler. */
if (unlikely (v_any_u64 (special)))
u = v_zerofy_f64 (u, special);
#endif
u = vaddq_f64 (u, u);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float64x2_t q = expm1_inline (u, &d->d);
float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
if (unlikely (v_any_u64 (special)))
return special_case (x, q, qp2, special);
return vdivq_f64 (q, qp2);
}
TEST_SIG (V, D, 1, tanh, -10.0, 10.0)
TEST_ULP (V_NAME_D1 (tanh), 2.21)
TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
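The identity in the comment, tanh(x) = (e^2x - 1)/(e^2x + 1), becomes
q/(q + 2) with q = expm1(2x), which avoids the cancellation that computing
exp(2x) - 1 directly would suffer for small x. In scalar form:

#include <math.h>

static double
tanh_sketch (double x)   /* assumes |x| inside the routine's main interval. */
{
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);    /* (e^2x - 1) / (e^2x + 1). */
}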

View file

@ -1,14 +1,13 @@
/*
* Single-precision vector tanh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_expm1f_inline.h"
static const struct data
@ -20,20 +19,23 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
.onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
float32x4_t q, uint32x4_t special)
{
return v_call_f32 (tanhf, x, y, special);
return v_call_f32 (
tanhf, x,
vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
version of expm1f. The maximum error is 2.58 ULP:
_ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
want 0x1.f9ba08p-5. */
float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -42,7 +44,9 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
/* The expm1 exponent bias is 1.0f reinterpreted as an int. */
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@ -58,16 +62,20 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
if (unlikely (v_any_u32 (special)))
return special_case (vreinterpretq_f32_u32 (ix),
vbslq_f32 (is_boring, boring, y), special);
return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
special);
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
PL_SIG (V, F, 1, tanh, -10.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (tanh), 2.09)
PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
HALF_WIDTH_ALIAS_F1 (tanh)
TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
TEST_ULP (V_NAME_F1 (tanh), 2.09)
TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)

View file

@ -0,0 +1,88 @@
/*
* Double-precision vector tanpi(x) function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
const static struct v_tanpi_data
{
float64x2_t c0, c2, c4, c6, c8, c10, c12;
double c1, c3, c5, c7, c9, c11, c13, c14;
} tanpi_data = {
/* Coefficients for tan(pi * x) computed with fpminimax
on [ 0x1p-1022 0x1p-2 ]
approx rel error: 0x1.7eap-55
approx abs error: 0x1.7eap-55. */
.c0 = V2 (0x1.921fb54442d18p1), /* pi. */
.c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5),
.c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9),
.c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13),
.c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17),
.c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21),
.c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27),
.c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
};
/* Approximation for double-precision vector tanpi(x)
The maximum error is 3.06 ULP:
_ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
want -0x1.fa30112702c95p+3. */
float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
{
const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
float64x2_t n = vrndnq_f64 (x);
/* inf produces nan that propagates. */
float64x2_t xr = vsubq_f64 (x, n);
float64x2_t ar = vabdq_f64 (x, n);
uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
/* Order-14 pairwise Horner. */
float64x2_t r2 = vmulq_f64 (r, r);
float64x2_t r4 = vmulq_f64 (r2, r2);
float64x2_t c_1_3 = vld1q_f64 (&d->c1);
float64x2_t c_5_7 = vld1q_f64 (&d->c5);
float64x2_t c_9_11 = vld1q_f64 (&d->c9);
float64x2_t c_13_14 = vld1q_f64 (&d->c13);
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
p = vfmaq_f64 (p1011, r4, p);
p = vfmaq_f64 (p89, r4, p);
p = vfmaq_f64 (p67, r4, p);
p = vfmaq_f64 (p45, r4, p);
p = vfmaq_f64 (p23, r4, p);
p = vfmaq_f64 (p01, r4, p);
p = vmulq_f64 (r, p);
float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
float64x2_t y = vbslq_f64 (flip, p_recip, p);
uint64x2_t sign
= veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
}
#if WANT_TRIGPI_TESTS
TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
TEST_ULP (V_NAME_D1 (tanpi), 2.57)
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
#endif
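The flip logic above exploits two identities: tanpi has period 1, so only
r = x - rint(x) matters, and tanpi(1/2 - r) = 1/tanpi(r), which keeps the
polynomial argument in [0, 1/4] where the fit converges well. A scalar
sketch (libm tan stands in for the polynomial):

#include <math.h>

static double
tanpi_sketch (double x)
{
  double n = nearbyint (x);
  double xr = x - n;                  /* reduced to [-1/2, 1/2]: period 1. */
  double ar = fabs (xr);
  int flip = ar > 0.25;
  double r = flip ? 0.5 - ar : ar;    /* polynomial argument in [0, 1/4]. */
  double p = tan (M_PI * r);          /* polynomial in the routine. */
  double y = flip ? 1.0 / p : p;      /* tanpi(1/2 - r) = 1/tanpi(r). */
  return copysign (y, xr);            /* tanpi is odd. */
}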

View file

@ -0,0 +1,70 @@
/*
* Single-precision vector tanpi(x) function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
const static struct v_tanpif_data
{
float32x4_t c0, c2, c4, c6;
float c1, c3, c5, c7;
} tanpif_data = {
/* Coefficients for tan(pi * x). */
.c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f),
.c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
.c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
};
/* Approximation for single-precision vector tanpi(x)
The maximum error is 3.34 ULP:
_ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
want 0x1.f70aa6p+2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
{
const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
float32x4_t n = vrndnq_f32 (x);
/* inf produces nan that propagates. */
float32x4_t xr = vsubq_f32 (x, n);
float32x4_t ar = vabdq_f32 (x, n);
uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
/* Order-7 pairwise Horner polynomial evaluation scheme. */
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t r4 = vmulq_f32 (r2, r2);
float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
float32x4_t p = vfmaq_f32 (p45, r4, p67);
p = vfmaq_f32 (p23, r4, p);
p = vfmaq_f32 (p01, r4, p);
p = vmulq_f32 (r, p);
float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
float32x4_t y = vbslq_f32 (flip, p_recip, p);
uint32x4_t sign
= veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
}
HALF_WIDTH_ALIAS_F1 (tanpi)
#if WANT_TRIGPI_TESTS
TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
TEST_ULP (V_NAME_F1 (tanpi), 2.84)
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
#endif
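
For completeness, a caller sketch, not part of this commit, showing how the routine above is reached through its vector-ABI symbol. In a real build the prototype would come from the library's headers rather than being hand-declared, and the VPCS attribute must match the definition.

#include <arm_neon.h>
#include <stdio.h>

__attribute__ ((aarch64_vector_pcs)) float32x4_t
_ZGVnN4v_tanpif (float32x4_t);

int
main (void)
{
  float in[4] = { 0.1f, 0.2f, 0.3f, 0.4f };
  float out[4];
  /* Four lanes of tanpi computed in one call.  */
  vst1q_f32 (out, _ZGVnN4v_tanpif (vld1q_f32 (in)));
  for (int i = 0; i < 4; i++)
    printf ("tanpif (%g) = %g\n", (double) in[i], (double) out[i]);
  return 0;
}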

View file

@ -0,0 +1,58 @@
/*
* Helper for single-precision routines which calculate exp(ax) and do not
* need special-case handling
*
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef MATH_V_EXPF_INLINE_H
#define MATH_V_EXPF_INLINE_H
#include "v_math.h"
struct v_expf_data
{
float ln2_hi, ln2_lo, c0, c2;
float32x4_t inv_ln2, c1, c3, c4;
/* asuint(1.0f). */
uint32x4_t exponent_bias;
};
/* maxerr: 1.45358 +0.5 ulp. */
#define V_EXPF_DATA \
{ \
.c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
.c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
.inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
}
static inline float32x4_t
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
{
/* Helper routine for calculating exp(ax).
Copied from v_expf.c, with all special-case handling removed - the
calling routine should handle special values if required. */
/* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t ax = vabsq_f32 (x);
float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
/* Custom order-4 Estrin avoids building a high-order monomial. */
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (d->c4, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
#endif // MATH_V_EXPF_INLINE_H
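
A sketch of the intended usage pattern, not part of this commit: the caller owns a struct initialised with V_EXPF_DATA and handles any special cases itself. The cosh-style wrapper name and the helper's file name are assumptions for illustration.

#include "v_math.h"
#include "v_expf_inline.h"      /* assumed file name for the helper above.  */

static const struct v_expf_data expf_consts = V_EXPF_DATA;

/* Illustrative core of cosh(x) = (e^|x| + e^-|x|) / 2.  The helper applies
   |x| itself and performs no special-case checks, so large |x| and NaN
   would have to be handled by the real caller.  */
static inline float32x4_t
coshf_core (float32x4_t x)
{
  float32x4_t t = v_expf_inline (x, &expf_consts);  /* e^|x|.  */
  return vaddq_f32 (vmulq_n_f32 (t, 0.5f), vdivq_f32 (v_f32 (0.5), t));
}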

View file

@ -0,0 +1,86 @@
/*
* Helper for double-precision routines which calculate exp(x) - 1 and do not
* need special-case handling
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef MATH_V_EXPM1_INLINE_H
#define MATH_V_EXPM1_INLINE_H
#include "v_math.h"
struct v_expm1_data
{
float64x2_t c2, c4, c6, c8;
float64x2_t invln2;
int64x2_t exponent_bias;
double c1, c3, c5, c7, c9, c10;
double ln2[2];
};
/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
#define V_EXPM1_DATA \
{ \
.c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
.c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
.c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
.c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
.c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
.ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
.invln2 = V2 (0x1.71547652b82fep0), \
.exponent_bias = V2 (0x3ff0000000000000), \
}
static inline float64x2_t
expm1_inline (float64x2_t x, const struct v_expm1_data *d)
{
/* Helper routine for calculating exp(x) - 1. */
float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
/* Reduce argument to smaller range:
Let i = round(x / ln2)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
int64x2_t i = vcvtq_s64_f64 (n);
float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
f = vfmsq_laneq_f64 (f, n, ln2, 1);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
float64x2_t f2 = vmulq_f64 (f, f);
float64x2_t f4 = vmulq_f64 (f2, f2);
float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
p = vfmaq_f64 (p47, f4, p);
p = vfmaq_f64 (p03, f4, p);
p = vfmaq_f64 (f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
float64x2_t t = vreinterpretq_f64_s64 (u);
/* expm1(x) ~= p * t + (t - 1). */
return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
}
#endif // MATH_V_EXPM1_INLINE_H
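
The reduction and reconstruction are easier to check in scalar form. A sketch, not part of this commit, in plain C99: libm expm1 stands in for the polynomial, round() matches the ties-away rounding of vrndaq_f64, and the function name is illustrative.

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
expm1_sketch (double x)
{
  const double inv_ln2 = 0x1.71547652b82fep0;
  const double ln2_hi = 0x1.62e42fefa39efp-1;
  const double ln2_lo = 0x1.abc9e3b39803fp-56;
  /* i = round(x / ln2); two-step subtraction keeps f accurate.  */
  double n = round (x * inv_ln2);
  double f = (x - n * ln2_hi) - n * ln2_lo;
  double p = expm1 (f);              /* stand-in for f + f^2 * P(f).  */
  /* t = 2^i by exponent manipulation (valid while i > -1023).  */
  int64_t i = (int64_t) n;
  uint64_t u = (uint64_t) (i + 1023) << 52;
  double t;
  memcpy (&t, &u, sizeof t);
  /* expm1(x) = t * (p + 1) - 1 = p * t + (t - 1); the second form keeps
     the -1 exact when t is very large or very small.  */
  return p * t + (t - 1.0);
}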

View file

@ -0,0 +1,62 @@
/*
* Helper for single-precision routines which calculate exp(x) - 1 and do not
* need special-case handling
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef MATH_V_EXPM1F_INLINE_H
#define MATH_V_EXPM1F_INLINE_H
#include "v_math.h"
struct v_expm1f_data
{
float32x4_t c0, c2;
int32x4_t exponent_bias;
float c1, c3, inv_ln2, c4;
float ln2_hi, ln2_lo;
};
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
.c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
.c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
.exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}
static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
/* Helper routine for calculating exp(x) - 1. */
float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
float32x4_t lane_consts = vld1q_f32 (&d->c1);
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
f = vfmsq_lane_f32 (f, j, ln2, 1);
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
float32x4_t p = vfmaq_f32 (p01, f2, p23);
p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */
int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
float32x4_t t = vreinterpretq_f32_s32 (u);
/* expm1(x) ~= p * t + (t - 1). */
return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
}
#endif // MATH_V_EXPM1F_INLINE_H
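
A hypothetical caller sketch, not part of this commit, using the identity tanh(x) = (e^2x - 1)/(e^2x + 1) = q/(q + 2) with q = expm1(2x). The wrapper and file names are assumptions, and a real caller would bound |x| first since the helper has no special-case handling.

#include "v_math.h"
#include "v_expm1f_inline.h"    /* assumed file name for the helper above.  */

static const struct v_expm1f_data expm1f_consts = V_EXPM1F_DATA;

static inline float32x4_t
tanhf_core (float32x4_t x)
{
  /* q = expm1(2x), then tanh(x) = q / (q + 2).  */
  float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2.0f), &expm1f_consts);
  return vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0f)));
}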

View file

@ -0,0 +1,119 @@
/*
* Helper for vector double-precision routines which calculate log(1 + x) and
* do not need special-case handling
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef MATH_V_LOG1P_INLINE_H
#define MATH_V_LOG1P_INLINE_H
#include "v_math.h"
struct v_log1p_data
{
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
int64x2_t one_top;
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
double ln2[2];
};
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
#define V_LOG1P_CONSTANTS_TABLE \
{ \
.c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
.c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
.c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
.c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
.c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
.c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
.c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
.c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
.c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
.c18 = -0x1.cfa7385bdb37ep-6, \
.ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
.hf_rt2_top = V2 (0x3fe6a09e00000000), \
.one_m_hf_rt2_top = V2 (0x00095f6200000000), \
.umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
}
#define BottomMask v_u64 (0xffffffff)
static inline float64x2_t
eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
{
/* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
float64x2_t c13 = vld1q_f64 (&d->c1);
float64x2_t c57 = vld1q_f64 (&d->c5);
float64x2_t c911 = vld1q_f64 (&d->c9);
float64x2_t c1315 = vld1q_f64 (&d->c13);
float64x2_t c1718 = vld1q_f64 (&d->c17);
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
p = vfmaq_f64 (p1415, m2, p);
p = vfmaq_f64 (p1213, m2, p);
p = vfmaq_f64 (p1011, m2, p);
p = vfmaq_f64 (p89, m2, p);
p = vfmaq_f64 (p67, m2, p);
p = vfmaq_f64 (p45, m2, p);
p = vfmaq_f64 (p23, m2, p);
return vfmaq_f64 (p01, m2, p);
}
static inline float64x2_t
log1p_inline (float64x2_t x, const struct v_log1p_data *d)
{
/* Helper for calculating log(x + 1):
- No special-case handling - this should be dealt with by the caller.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
using v_sel, for improved accuracy when the argument to log1p is close
to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
in the source of the caller before including this file. */
float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
uint64x2_t mi = vreinterpretq_u64_f64 (m);
uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
int64x2_t ki
= vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
float64x2_t k = vcvtq_f64_s64 (ki);
/* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
/* Correction term c/m. */
float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
#ifndef WANT_V_LOG1P_K0_SHORTCUT
# error \
"Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_V_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
that the approximation is solely the polynomial. */
uint64x2_t k0 = vceqzq_f64 (k);
cm = v_zerofy_f64 (cm, k0);
f = vbslq_f64 (k0, x, f);
#endif
/* Approximate log1p(f) on the reduced input using a polynomial. */
float64x2_t f2 = vmulq_f64 (f, f);
float64x2_t p = eval_poly (f, f2, d);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
}
#endif // MATH_V_LOG1P_INLINE_H
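
Because of the #error above, a caller must pick the shortcut behaviour explicitly before including the header. A minimal caller sketch, not part of this commit; the wrapper name is illustrative.

#include "v_math.h"

/* Opt in to the k=0 shortcut: slightly slower, more accurate near 0.  */
#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"

static const struct v_log1p_data log1p_consts = V_LOG1P_CONSTANTS_TABLE;

static inline float64x2_t
log1p_core (float64x2_t x)
{
  /* The helper has no special-case handling: x is assumed finite, > -1.  */
  return log1p_inline (x, &log1p_consts);
}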

View file

@ -0,0 +1,94 @@
/*
* Helper for single-precision routines which calculate log(1 + x) and do not
* need special-case handling
*
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef MATH_V_LOG1PF_INLINE_H
#define MATH_V_LOG1PF_INLINE_H
#include "v_math.h"
#include "v_poly_f32.h"
struct v_log1pf_data
{
uint32x4_t four;
int32x4_t three_quarters;
float c0, c3, c5, c7;
float32x4_t c4, c6, c1, c2, ln2;
};
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
.c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
.c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
.c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
.c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
.ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
.three_quarters = V4 (0x3f400000) \
}
static inline float32x4_t
eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
/* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
float32x4_t c0357 = vld1q_f32 (&d->c0);
float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
float32x4_t p = vfmaq_f32 (p45, m2, p67);
p = vfmaq_f32 (p23, m2, p);
p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
p = vfmaq_f32 (m, m2, p);
return vfmaq_f32 (p, m2, q);
}
static inline float32x4_t
log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
/* Helper for calculating log(x + 1). */
/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
is in [-0.25, 0.5]):
log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
We approximate log1p(m) with a polynomial, then scale by
k*log(2). Instead of doing this directly, we use an intermediate
scale factor s = 4*2^-k to ensure the scale is representable
as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
/* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
= vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
/* Scale up to ensure that the scale factor is representable as normalised
fp32 number, and scale m down accordingly. */
float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
/* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
/* Evaluate polynomial on the reduced interval. */
float32x4_t p = eval_poly (m_scale, d);
/* The scale factor to be applied back at the end - by multiplying float(k)
   by 2^-23 we recover the unbiased exponent k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
/* Apply the scaling back. */
return vfmaq_f32 (p, scale_back, d->ln2);
}
#endif // MATH_V_LOG1PF_INLINE_H
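
The bit-level reduction above is dense; a scalar sketch, not part of this commit, spells out how k, s and the scaled argument are produced (plain C99, illustrative function name):

#include <stdint.h>
#include <string.h>

/* With m = x + 1, k is extracted as raw exponent-field bits so that
   m * 2^-k lands in [0.75, 1.5), i.e. m_scale is in [-0.25, 0.5].
   Subtracting k from the exponent field of x (never rounding x + 1) and
   adding 0.25*s - 1, with s = 4 * 2^-k, gives m_scale = (x + 1)*2^-k - 1.  */
static float
log1pf_reduce_sketch (float x, float *m_scale)
{
  float m = x + 1.0f;
  uint32_t mi, xi, su;
  memcpy (&mi, &m, sizeof mi);
  int32_t k = (int32_t) ((mi - 0x3f400000u) & 0xff800000u);
  su = 0x40800000u - (uint32_t) k;  /* s = 4 * 2^-k.  */
  float s;
  memcpy (&s, &su, sizeof s);
  memcpy (&xi, &x, sizeof xi);
  xi -= (uint32_t) k;               /* x * 2^-k by exponent manipulation.  */
  float x_scaled;
  memcpy (&x_scaled, &xi, sizeof x_scaled);
  *m_scale = x_scaled + (0.25f * s - 1.0f);
  /* Recover k as a float for the final k*ln2 term: the raw bits equal
     k << 23, so converting to float and scaling by 2^-23 yields k.  */
  return (float) k * 0x1p-23f;
}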

View file

@ -1,7 +1,7 @@
/*
* Double-precision vector log(x) function - inline version
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@ -57,8 +57,8 @@ log_lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);

View file

@ -1,36 +1,63 @@
/*
* Vector math abstractions.
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _V_MATH_H
#define _V_MATH_H
#ifndef WANT_VMATH
/* Enable the build of vector math code. */
# define WANT_VMATH 1
#if !__aarch64__
# error "Cannot build without AArch64"
#endif
#if WANT_VMATH
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
# if __aarch64__
# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
# else
# error "Cannot build without AArch64"
# endif
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
#define V_NAME_D1(fun) _ZGVnN2v_##fun
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
# include <stdint.h>
# include "math_config.h"
# if __aarch64__
#if USE_GLIBC_ABI
# include <arm_neon.h>
# define HALF_WIDTH_ALIAS_F1(fun) \
float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
{ \
return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \
}
# define HALF_WIDTH_ALIAS_F2(fun) \
float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
{ \
return vget_low_f32 ( \
_ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \
}
#else
# define HALF_WIDTH_ALIAS_F1(fun)
# define HALF_WIDTH_ALIAS_F2(fun)
#endif
#include <stdint.h>
#include "math_config.h"
#include <arm_neon.h>
/* Shorthand helpers for declaring constants. */
# define V2(X) { X, X }
# define V4(X) { X, X, X, X }
# define V8(X) { X, X, X, X, X, X, X, X }
#define V2(X) \
{ \
X, X \
}
#define V4(X) \
{ \
X, X, X, X \
}
#define V8(X) \
{ \
X, X, X, X, X, X, X, X \
}
static inline int
v_any_u16h (uint16x4_t x)
@ -38,6 +65,12 @@ v_any_u16h (uint16x4_t x)
return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
}
static inline int
v_lanes32 (void)
{
return 4;
}
static inline float32x4_t
v_f32 (float x)
{
@ -54,7 +87,7 @@ v_s32 (int32_t x)
return (int32x4_t) V4 (x);
}
/* true if any elements of a vector compare result is non-zero. */
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u32 (uint32x4_t x)
{
@ -97,6 +130,11 @@ v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
}
static inline int
v_lanes64 (void)
{
return 2;
}
static inline float64x2_t
v_f64 (double x)
{
@ -113,20 +151,13 @@ v_s64 (int64_t x)
return (int64x2_t) V2 (x);
}
/* true if any elements of a vector compare result is non-zero. */
/* true if any element of a v_cond result is non-zero. */
static inline int
v_any_u64 (uint64x2_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (x) != 0;
}
/* true if all elements of a vector compare result is 1. */
static inline int
v_all_u64 (uint64x2_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
}
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
@ -137,7 +168,6 @@ v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
}
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
@ -169,7 +199,4 @@ v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
}
# endif
#endif
#endif

View file

@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on single-precision AdvSIMD input, using
* various schemes.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef PL_MATH_POLY_ADVSIMD_F32_H
#define PL_MATH_POLY_ADVSIMD_F32_H
#ifndef MATH_POLY_ADVSIMD_F32_H
#define MATH_POLY_ADVSIMD_F32_H
#include <arm_neon.h>

View file

@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on double-precision AdvSIMD input, using
* various schemes.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef PL_MATH_POLY_ADVSIMD_F64_H
#define PL_MATH_POLY_ADVSIMD_F64_H
#ifndef MATH_POLY_ADVSIMD_F64_H
#define MATH_POLY_ADVSIMD_F64_H
#include <arm_neon.h>

View file

@ -1,12 +1,12 @@
/*
* Core approximation for double-precision vector sincos
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "poly_advsimd_f64.h"
#include "v_poly_f64.h"
static const struct v_sincos_data
{

View file

@ -1,7 +1,7 @@
/*
* Core approximation for single-precision vector sincos
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/

View file

@ -0,0 +1,64 @@
/*
* Helper for double-precision vector sincospi function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "v_poly_f64.h"
static const struct v_sincospi_data
{
float64x2_t poly[10], range_val;
} v_sincospi_data = {
/* Polynomial coefficients generated using Remez algorithm,
see sinpi.sollya for details. */
.poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
.range_val = V2 (0x1p63),
};
/* Double-precision vector function allowing calculation of both sin and cos in
one function call, using separate argument reduction and shared low-order
polynomials.
Approximation for vector double-precision sincospi(x).
Maximum Error 3.09 ULP:
_ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
want 0x1.fd54d0b327cf4p-1
Maximum Error 3.16 ULP:
_ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
want 0x1.fd2da484ff402p-1. */
static inline float64x2x2_t
v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
{
  /* If round(x) is odd, the sign of the result should be inverted for sinpi
     and reintroduced for cospi. */
uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
uint64x2_t odd = vshlq_n_u64 (
vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
/* r = x - rint(x). */
float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
/* Pairwise Horner approximation for y = sin(r * pi). */
float64x2_t sr2 = vmulq_f64 (sr, sr);
float64x2_t sr4 = vmulq_f64 (sr2, sr2);
float64x2_t cr2 = vmulq_f64 (cr, cr);
float64x2_t cr4 = vmulq_f64 (cr2, cr2);
float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
float64x2_t sinpix
= vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
float64x2_t cospix
= vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
return (float64x2x2_t){ sinpix, cospix };
}
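
A sketch, not part of this commit, of how such an inline helper is typically wrapped into a scalar-ABI sincospi entry point that stores both results; the shim name and output layout are assumptions, and ptr_barrier is used as in the routines above.

/* Illustrative wrapper: evaluate both results once via the helper above and
   store them through out-pointers.  Assumes it lives in a file that can see
   v_sincospi_data and v_sincospi_inline.  */
static void
sincospi_shim (double x, double *sinpi_out, double *cospi_out)
{
  const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
  float64x2x2_t sc = v_sincospi_inline (vdupq_n_f64 (x), d);
  *sinpi_out = vgetq_lane_f64 (sc.val[0], 0);
  *cospi_out = vgetq_lane_f64 (sc.val[1], 0);
}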

View file

@ -0,0 +1,57 @@
/*
* Helper for single-precision vector sincospi function.
*
* Copyright (c) 2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
#include "v_poly_f32.h"
const static struct v_sincospif_data
{
float32x4_t poly[6], range_val;
} v_sincospif_data = {
/* Taylor series coefficients for sin(pi * x). */
.poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
.range_val = V4 (0x1p31f),
};
/* Single-precision vector function allowing calculation of both sinpi and
cospi in one function call, using shared argument reduction and polynomials.
Worst-case error for sin is 3.04 ULP:
_ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
Worst-case error for cos is 3.18 ULP:
_ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
*/
static inline float32x4x2_t
v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
{
  /* If round(x) is odd, the sign of the result should be inverted for sinpi
     and reintroduced for cospi. */
uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
uint32x4_t odd = vshlq_n_u32 (
vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
/* r = x - rint(x). */
float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
/* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
/* Pairwise Horner approximation for y = sin(r * pi). */
float32x4_t sr2 = vmulq_f32 (sr, sr);
float32x4_t sr4 = vmulq_f32 (sr2, sr2);
float32x4_t cr2 = vmulq_f32 (cr, cr);
float32x4_t cr4 = vmulq_f32 (cr2, cr2);
float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
float32x4_t sinpix
= vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
float32x4_t cospix
= vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
return (float32x4x2_t){ sinpix, cospix };
}

View file

@ -1,14 +1,14 @@
/*
* Double-precision scalar cospi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#include "poly_scalar_f64.h"
/* Taylor series coefficients for sin(pi * x).
@ -29,9 +29,9 @@ static const double poly[]
cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
want 0x1.fffffffffd16ep-1. */
double
cospi (double x)
arm_math_cospi (double x)
{
if (isinf (x))
if (isinf (x) || isnan (x))
return __math_invalid (x);
double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
@ -81,9 +81,18 @@ cospi (double x)
return asdouble (asuint64 (y) ^ sign);
}
PL_SIG (S, D, 1, cospi, -0.9, 0.9)
PL_TEST_ULP (cospi, 2.63)
PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
#if WANT_EXPERIMENTAL_MATH
double
cospi (double x)
{
return arm_math_cospi (x);
}
#endif
#if WANT_TRIGPI_TESTS
TEST_ULP (arm_math_cospi, 2.63)
TEST_SYM_INTERVAL (arm_math_cospi, 0, 0x1p-63, 5000)
TEST_SYM_INTERVAL (arm_math_cospi, 0x1p-63, 0.5, 10000)
TEST_SYM_INTERVAL (arm_math_cospi, 0.5, 0x1p51f, 10000)
TEST_SYM_INTERVAL (arm_math_cospi, 0x1p51f, inf, 10000)
#endif

View file

@ -1,14 +1,14 @@
/*
* Single-precision scalar cospi function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
/* Taylor series coefficients for sin(pi * x). */
#define C0 0x1.921fb6p1f
@ -25,9 +25,9 @@
cospif(0x1.37e844p-4) got 0x1.f16b3p-1
want 0x1.f16b2ap-1. */
float
cospif (float x)
arm_math_cospif (float x)
{
if (isinf (x))
if (isinf (x) || isnan (x))
return __math_invalidf (x);
float ax = asfloat (asuint (x) & ~0x80000000);
@ -76,9 +76,18 @@ cospif (float x)
return asfloat (asuint (y * r) ^ sign);
}
PL_SIG (S, F, 1, cospi, -0.9, 0.9)
PL_TEST_ULP (cospif, 2.15)
PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)
#if WANT_EXPERIMENTAL_MATH
float
cospif (float x)
{
return arm_math_cospif (x);
}
#endif
#if WANT_TRIGPI_TESTS
TEST_ULP (arm_math_cospif, 2.15)
TEST_SYM_INTERVAL (arm_math_cospif, 0, 0x1p-31, 5000)
TEST_SYM_INTERVAL (arm_math_cospif, 0x1p-31, 0.5, 10000)
TEST_SYM_INTERVAL (arm_math_cospif, 0.5, 0x1p22f, 10000)
TEST_SYM_INTERVAL (arm_math_cospif, 0x1p22f, inf, 10000)
#endif

View file

@ -5,7 +5,6 @@ glibc-specific conventions need not be followed.
The requirements for portable code apply to non-portable code with the
following differences:
1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There
are no specific restrictions on acceptable ULP error, but if functions
provide significantly less accuracy than portable equivalents then a clear
@ -15,9 +14,3 @@ following differences:
2. Functions are assumed to support round-to-nearest mode by default, unless
stated; other rounding modes are not required to be provided.
3. Handling of special cases may be relaxed for vector functions. Checking
whether each vector lane contains special values such as NaN, Inf or
denormal numbers can prove too costly for vector functions. This is often
not required since vector functions are typically used along with aggressive
compiler optimization flags.

View file

@ -1,23 +1,23 @@
/*
* Double-precision acos(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "poly_scalar_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define AbsMask (0x7fffffffffffffff)
#define Half (0x3fe0000000000000)
#define One (0x3ff0000000000000)
#define PiOver2 (0x1.921fb54442d18p+0)
#define Pi (0x1.921fb54442d18p+1)
#define Small (0x3c90000000000000) /* 2^-53. */
#define Small16 (0x3c90)
#define QNaN (0x7ff8)
#define AbsMask 0x7fffffffffffffff
#define Half 0x3fe0000000000000
#define One 0x3ff0000000000000
#define PiOver2 0x1.921fb54442d18p+0
#define Pi 0x1.921fb54442d18p+1
#define Small 0x3c90000000000000 /* 2^-53. */
#define Small16 0x3c90
#define QNaN 0x7ff8
/* Fast implementation of double-precision acos(x) based on polynomial
approximation of double-precision asin(x).
@ -29,8 +29,8 @@
acos(x) = pi/2 - asin(x)
and use an order 11 polynomial P such that the final approximation of asin is
an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
and use an order 11 polynomial P such that the final approximation of asin
is an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
The largest observed error in this region is 1.18 ulps,
acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
@ -90,11 +90,11 @@ acos (double x)
return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
}
PL_SIG (S, D, 1, acos, -1.0, 1.0)
PL_TEST_ULP (acos, 1.02)
PL_TEST_INTERVAL (acos, 0, Small, 5000)
PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (acos, -0, -inf, 20000)
TEST_SIG (S, D, 1, acos, -1.0, 1.0)
TEST_ULP (acos, 1.02)
TEST_INTERVAL (acos, 0, Small, 5000)
TEST_INTERVAL (acos, Small, 0.5, 50000)
TEST_INTERVAL (acos, 0.5, 1.0, 50000)
TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
TEST_INTERVAL (acos, 0x1p11, inf, 20000)
TEST_INTERVAL (acos, -0, -inf, 20000)
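
A worked restatement of the two regions, not part of this commit, with p denoting the asin(sqrt(z)) approximation computed in the elided body:

/* For |x| < 0.5, the odd polynomial is used directly:
     acos(x) = pi/2 - asin(x) ~ pi/2 - (x + x^3 * P(x^2)).
   For 0.5 <= |x| <= 1, substitute z = (1 - |x|)/2, so that with
     p ~ asin(sqrt(z)) = sqrt(z) * (1 + z * P(z)),
   the half-angle identity acos(x) = 2 * asin(sqrt((1 - x)/2)) gives
     acos(x) = 2 * p         for x >= 0.5,
     acos(x) = pi - 2 * p    for x <= -0.5,
   matching the final line (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p.  */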

View file

@ -1,23 +1,23 @@
/*
* Single-precision acos(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define AbsMask (0x7fffffff)
#define Half (0x3f000000)
#define One (0x3f800000)
#define PiOver2f (0x1.921fb6p+0f)
#define Pif (0x1.921fb6p+1f)
#define Small (0x32800000) /* 2^-26. */
#define Small12 (0x328)
#define QNaN (0x7fc)
#define AbsMask 0x7fffffff
#define Half 0x3f000000
#define One 0x3f800000
#define PiOver2f 0x1.921fb6p+0f
#define Pif 0x1.921fb6p+1f
#define Small 0x32800000 /* 2^-26. */
#define Small12 0x328
#define QNaN 0x7fc
/* Fast implementation of single-precision acos(x) based on polynomial
approximation of single-precision asin(x).
@ -89,11 +89,11 @@ acosf (float x)
return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
}
PL_SIG (S, F, 1, acos, -1.0, 1.0)
PL_TEST_ULP (acosf, 0.82)
PL_TEST_INTERVAL (acosf, 0, Small, 5000)
PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
TEST_SIG (S, F, 1, acos, -1.0, 1.0)
TEST_ULP (acosf, 0.82)
TEST_INTERVAL (acosf, 0, Small, 5000)
TEST_INTERVAL (acosf, Small, 0.5, 50000)
TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
TEST_INTERVAL (acosf, -0, -inf, 20000)

View file

@ -1,31 +1,26 @@
/*
* Double-precision acosh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define Ln2 (0x1.62e42fefa39efp-1)
#define MinusZero (0x8000000000000000)
#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */
#define Two (0x4000000000000000) /* asuint64(2.0). */
double
optr_aor_log_f64 (double);
double
log1p (double);
/* acosh approximation using a variety of approaches on different intervals:
acosh(x) = ln(x + sqrt(x * x - 1)).
x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
close enough to x that we can calculate the result by ln(2x) == ln(x) +
x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1)
is close enough to x that we can calculate the result by ln(2x) == ln(x) +
ln(2). The greatest observed error in this region is 0.98 ULP:
acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9
want 0x1.28066a11a7c8p+9.
@ -48,19 +43,19 @@ acosh (double x)
return __math_invalid (x);
if (unlikely (ix >= SquareLim))
return optr_aor_log_f64 (x) + Ln2;
return log (x) + Ln2;
if (ix >= Two)
return optr_aor_log_f64 (x + sqrt (x * x - 1));
return log (x + sqrt (x * x - 1));
double xm1 = x - 1;
return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1));
}
PL_SIG (S, D, 1, acosh, 1.0, 10.0)
PL_TEST_ULP (acosh, 2.19)
PL_TEST_INTERVAL (acosh, 0, 1, 10000)
PL_TEST_INTERVAL (acosh, 1, 2, 100000)
PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
PL_TEST_INTERVAL (acosh, -0, -inf, 10000)
TEST_SIG (S, D, 1, acosh, 1.0, 10.0)
TEST_ULP (acosh, 2.19)
TEST_INTERVAL (acosh, 0, 1, 10000)
TEST_INTERVAL (acosh, 1, 2, 100000)
TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
TEST_INTERVAL (acosh, -0, -inf, 10000)
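
A worked derivation, not part of this commit, of why the third branch is pure algebra. With m = x - 1 (so 1 <= x < 2 means 0 <= m < 1):

/* acosh(x) = ln(x + sqrt(x*x - 1))
            = ln(1 + m + sqrt((x - 1) * (x + 1)))
            = ln(1 + m + sqrt(m*m + 2*m))
            = log1p(m + sqrt(2*m + m*m)),
   which is exactly the log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)) line and
   avoids the cancellation of evaluating ln directly near 1.  */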

View file

@ -1,27 +1,19 @@
/*
* Single-precision acosh(x) function.
*
* Copyright (c) 2022-2023, Arm Limited.
* Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define Ln2 (0x1.62e4p-1f)
#define MinusZero 0x80000000
#define SquareLim 0x5f800000 /* asuint(0x1p64). */
#define Two 0x40000000
/* Single-precision log from math/. */
float
optr_aor_log_f32 (float);
/* Single-precision log(1+x) from pl/math. */
float
log1pf (float);
/* acoshf approximation using a variety of approaches on different intervals:
x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
@ -45,19 +37,19 @@ acoshf (float x)
return __math_invalidf (x);
if (unlikely (ix >= SquareLim))
return optr_aor_log_f32 (x) + Ln2;
return logf (x) + Ln2;
if (ix > Two)
return optr_aor_log_f32 (x + sqrtf (x * x - 1));
return logf (x + sqrtf (x * x - 1));
float xm1 = x - 1;
return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1));
}
PL_SIG (S, F, 1, acosh, 1.0, 10.0)
PL_TEST_ULP (acoshf, 2.30)
PL_TEST_INTERVAL (acoshf, 0, 1, 100)
PL_TEST_INTERVAL (acoshf, 1, 2, 10000)
PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
PL_TEST_INTERVAL (acoshf, -0, -inf, 10000)
TEST_SIG (S, F, 1, acosh, 1.0, 10.0)
TEST_ULP (acoshf, 2.30)
TEST_INTERVAL (acoshf, 0, 1, 100)
TEST_INTERVAL (acoshf, 1, 2, 10000)
TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
TEST_INTERVAL (acoshf, -0, -inf, 10000)

View file

@ -1,15 +1,15 @@
/*
* Double-precision inverse error function (AdvSIMD variant).
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_test.h"
#include "test_defs.h"
#include "mathlib.h"
#include "math_config.h"
#include "pl_sig.h"
#include "poly_advsimd_f64.h"
#include "test_sig.h"
#include "v_poly_f64.h"
#define V_LOG_INLINE_POLY_ORDER 4
#include "v_log_inline.h"
@ -22,7 +22,7 @@ const static struct data
can be taken. */
double P[8][2], Q[7][2];
float64x2_t tailshift;
uint8x16_t idx;
uint8_t idx[16];
struct v_log_inline_data log_tbl;
float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
@ -58,7 +58,7 @@ const static struct data
V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
.tailshift = V2 (-0.87890625),
.idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
.idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
.log_tbl = V_LOG_CONSTANTS };
static inline float64x2_t
@ -128,7 +128,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));
uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
uint8x16_t idx = vaddq_u8 (d->idx, off);
uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off);
float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
t = vfmaq_f64 (t, x, x);
@ -150,12 +150,17 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
return vdivq_f64 (p, q);
}
PL_SIG (V, D, 1, erfinv, -0.99, 0.99)
PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8)
#if USE_MPFR
# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference
#else
TEST_SIG (V, D, 1, erfinv, -0.99, 0.99)
TEST_ULP (V_NAME_D1 (erfinv), 24.8)
TEST_DISABLE_FENV (V_NAME_D1 (erfinv))
TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
/* Test with control lane in each interval. */
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
0.5)
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
0.8)
PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
0.95)
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5)
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8)
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95)
#endif

View file

@ -1,13 +1,13 @@
/*
* Single-precision inverse error function (AdvSIMD variant).
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "poly_advsimd_f32.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_poly_f32.h"
#include "v_logf_inline.h"
const static struct data
@ -24,14 +24,15 @@ const static struct data
P_10 and Q_10 are also stored in homogeneous vectors to allow better
memory access when no lanes are in a tail region. */
float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
float Plo[4], PQ[4], Qhi[4];
float32x4_t P29_3, tailshift;
float32x4_t P_50[6], Q_50[2];
float32x4_t P_10[3], Q_10[3];
uint8x16_t idxhi, idxlo;
uint8_t idxhi[16], idxlo[16];
struct v_logf_data logf_tbl;
} data = {
.idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
.idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
.idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 },
.idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 },
.P29_3 = V4 (0x1.b13626p-2),
.tailshift = V4 (-0.87890625),
.Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
@ -86,7 +87,7 @@ lookup (float32x4_t tbl, uint8x16_t idx)
tail region:
_ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
want 0x1.b4793ap+0 . */
float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@ -124,18 +125,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
two pairs of coeffs, so we need two idx vectors - one for each pair. */
uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off);
uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off);
uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off);
uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off);
/* Load the tables. */
float32x4_t p_lo = d->Plo;
float32x4_t pq = d->PQ;
float32x4_t qhi = d->Qhi;
float32x4_t plo = vld1q_f32 (d->Plo);
float32x4_t pq = vld1q_f32 (d->PQ);
float32x4_t qhi = vld1q_f32 (d->Qhi);
/* Do the lookup (and calculate p3 by masking non-tail lanes). */
float32x4_t p3 = vreinterpretq_f32_u32 (
vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi),
float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi),
p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);
@ -155,9 +156,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
return vdivq_f32 (p, q);
}
PL_SIG (V, F, 1, erfinv, -0.99, 0.99)
PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49)
HALF_WIDTH_ALIAS_F1 (erfinv)
#if USE_MPFR
# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference
#else
TEST_SIG (V, F, 1, erfinv, -0.99, 0.99)
TEST_DISABLE_FENV (V_NAME_F1 (erfinv))
TEST_ULP (V_NAME_F1 (erfinv), 4.49)
TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000)
/* Test with control lane in each interval. */
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5)
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8)
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95)
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5)
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8)
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95)
#endif
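
The byte-index trick above deserves a standalone illustration. A sketch, not part of this commit, assuming lookup is a vqtbl1q_u8 wrapper as its signature suggests: each coefficient vector holds { normal, tail } pairs, base indices 0-3 replicated across all four lanes broadcast the first 32-bit element of a pair, and adding 4 on tail lanes selects the second.

#include <arm_neon.h>

/* Per-lane select between the two members of a coefficient pair held in one
   vector: lanes where is_tail is all-ones read bytes 4-7 (the tail
   coefficient), the rest read bytes 0-3 (the normal one).  This mirrors the
   off = is_tail & 4 and idxlo/idxhi construction in the routine above.  */
static inline float32x4_t
select_pair (float32x4_t coeff_pairs, uint32x4_t is_tail)
{
  const uint8_t base[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
  uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
  uint8x16_t idx = vaddq_u8 (vld1q_u8 (base), off);
  return vreinterpretq_f32_u8 (
      vqtbl1q_u8 (vreinterpretq_u8_f32 (coeff_pairs), idx));
}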

View file

@ -1,7 +1,7 @@
/*
* Single-precision vector log function - inline version
*
* Copyright (c) 2019-2023, Arm Limited.
* Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/

View file

@ -1,22 +1,22 @@
/*
* Double-precision asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f64.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
#include "test_sig.h"
#include "test_defs.h"
#define AbsMask (0x7fffffffffffffff)
#define Half (0x3fe0000000000000)
#define One (0x3ff0000000000000)
#define PiOver2 (0x1.921fb54442d18p+0)
#define Small (0x3e50000000000000) /* 2^-26. */
#define Small16 (0x3e50)
#define QNaN (0x7ff8)
#define AbsMask 0x7fffffffffffffff
#define Half 0x3fe0000000000000
#define One 0x3ff0000000000000
#define PiOver2 0x1.921fb54442d18p+0
#define Small 0x3e50000000000000 /* 2^-26. */
#define Small16 0x3e50
#define QNaN 0x7ff8
/* Fast implementation of double-precision asin(x) based on polynomial
approximation.
@ -54,8 +54,8 @@
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
The largest observed error in this region is 2.69 ulps,
asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
want 0x1.110d7e85fdd53p-1. */
asin(0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
want 0x1.1111dd54ddf99p-1. */
double
asin (double x)
{
@ -96,11 +96,11 @@ asin (double x)
return asdouble (asuint64 (y) | sign);
}
PL_SIG (S, D, 1, asin, -1.0, 1.0)
PL_TEST_ULP (asin, 2.19)
PL_TEST_INTERVAL (asin, 0, Small, 5000)
PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
PL_TEST_INTERVAL (asin, -0, -inf, 20000)
TEST_SIG (S, D, 1, asin, -1.0, 1.0)
TEST_ULP (asin, 2.20)
TEST_INTERVAL (asin, 0, Small, 5000)
TEST_INTERVAL (asin, Small, 0.5, 50000)
TEST_INTERVAL (asin, 0.5, 1.0, 50000)
TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
TEST_INTERVAL (asin, 0x1p11, inf, 20000)
TEST_INTERVAL (asin, -0, -inf, 20000)

View file

@ -1,7 +1,7 @@
/*
* Coefficients for single-precision asin(x) function.
*
* Copyright (c) 2023, Arm Limited.
* Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/

Some files were not shown because too many files have changed in this diff Show more