Update the Arm Optimized Routines library to v25.01

Sponsored by: Arm Ltd
2026-01-11 19:57:22 +00:00 · 2025-01-10 11:14:39 +00:00 · 2025-01-10 11:14:39 +00:00 · f3087bef11
commit f3087bef11
parent 16f0d01f9c 9d1de25930
472 changed files with 11930 additions and 14603 deletions
--- a/contrib/arm-optimized-routines/MAINTAINERS
+++ b/contrib/arm-optimized-routines/MAINTAINERS
@ -1,12 +1,9 @@
 /
-	Szabolcs Nagy <szabolcs.nagy@arm.com>
+	Tamar Christina <tamar.christina@arm.com>
 math/
-	Szabolcs Nagy <szabolcs.nagy@arm.com>
-networking/
-	Szabolcs Nagy <szabolcs.nagy@arm.com>
-pl/
 	Pierre Blanchard <pierre.blanchard@arm.com>
 	Joe Ramsay <joe.ramsay@arm.com>
+networking/
+	Ola Liljedahl <ola.liljedahl@arm.com>
 string/
-	Szabolcs Nagy <szabolcs.nagy@arm.com>
 	Wilco Dijkstra <wilco.dijkstra@arm.com>
--- a/contrib/arm-optimized-routines/Makefile
+++ b/contrib/arm-optimized-routines/Makefile
@ -1,6 +1,6 @@
 # Makefile - requires GNU make
 #
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2024, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception

 srcdir = .
@ -11,7 +11,6 @@ includedir = $(prefix)/include

 # Configure these in config.mk, do not make changes in this file.
 SUBS = math string networking
-PLSUBS = math
 HOST_CC = cc
 HOST_CFLAGS = -std=c99 -O2
 HOST_LDFLAGS =
@ -21,12 +20,22 @@ CPPFLAGS =
 CFLAGS = -std=c99 -O2
 CFLAGS_SHARED = -fPIC
 CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS)
-CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL
 LDFLAGS =
 LDLIBS =
 AR = $(CROSS_COMPILE)ar
 RANLIB = $(CROSS_COMPILE)ranlib
 INSTALL = install
+# Detect OS.
+# Assume Unix environment: Linux, Darwin, or Msys.
+OS := $(shell uname -s)
+OS := $(patsubst MSYS%,Msys,$(OS))
+# Following math dependencies can be adjusted in config file
+# if necessary, e.g. for Msys.
+libm-libs = -lm
+libc-libs = -lc
+mpfr-libs = -lmpfr
+gmp-libs = -lgmp
+mpc-libs = -lmpc

 all:

@ -53,7 +62,6 @@ $(DIRS):
 	mkdir -p $@

 $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED)
-$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED)

 build/%.o: $(srcdir)/%.S
 	$(CC) $(CFLAGS_ALL) -c -o $@ $<
--- a/contrib/arm-optimized-routines/README
+++ b/contrib/arm-optimized-routines/README
@ -12,12 +12,25 @@ contribution requirements are documented in README.contributors of
 the appropriate subdirectory.

 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v24.01.
+release is v25.01.

 Source code layout:

 build/          - build directory (created by make).
-math/           - math subproject sources.
+math/           - math subproject sources for generic scalar
+                  subroutines and sources shared with
+                  subdirectories of math/.
+                  All math routines should meet the quality
+                  requirements stated in math/README.contributors,
+                  routines that fail to do so are located in an
+                  experimental/ directory.
+math/aarch64/   - math subproject AArch64-specific sources
+                  and sources shared with subdirectories.
+math/aarch64/advsimd      - AdvSIMD-specific math sources.
+math/aarch64/experimental - Experimental math sources that do not
+                            meet quality requirements stated in
+                            math/README.contributors.
+math/aarch64/sve          - SVE-specific math sources.
 math/include/   - math library public headers.
 math/test/      - math test and benchmark related sources.
 math/tools/     - tools used for designing the algorithms.
@ -25,9 +38,16 @@ networking/     - networking subproject sources.
 networking/include/ - networking library public headers.
 networking/test/ - networking test and benchmark related sources.
 string/         - string routines subproject sources.
+                  All string routines should meet the quality
+                  requirements stated in string/README.contributors,
+                  routines that fail to do so are located in an
+                  experimental/ directory.
+string/<arch>   - <arch>-specific string routines sources for
+                  <arch>=aarch64 and arm.
+string/aarch64/experimental - Experimental string routines which
+                              may not be fully optimized yet.
 string/include/ - string library public headers.
 string/test/    - string test and benchmark related sources.
-pl/...          - separately maintained performance library code.

 The steps to build the target libraries and run the tests:

@ -50,6 +70,13 @@ Or building and testing the math subproject only:
 make all-math
 make check-math

+Note on compiler compatibility/requirement:
+
+SVE routines are always built by default - this means that on AArch64
+GCC >= 10 or LLVM >= 5 are always required for SVE ACLE compatibility.
+There is no explicit check for compatible compiler, therefore the SVE
+routines will fail to build if CC is too old.
+
 The test system requires libmpfr and libmpc.
 For example on debian linux they can be installed as:

--- a/contrib/arm-optimized-routines/config.mk.dist
+++ b/contrib/arm-optimized-routines/config.mk.dist
@ -1,14 +1,11 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2023, Arm Limited.
+# Copyright (c) 2018-2024, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception

 # Subprojects to build
 SUBS = math string networking

-# Subsubprojects to build if subproject pl is built
-PLSUBS = math
-
 # Target architecture: aarch64, arm or x86_64
 ARCH = aarch64

@ -30,6 +27,27 @@ HOST_CFLAGS += -Wall -Wno-unused-function
 HOST_CFLAGS += -g
 CFLAGS += -g

+ifeq ($(OS),Msys)
+  # llvm is the only available/valid native compiler
+  CC = clang
+  AR = llvm-ar
+  RANLIB = llvm-ranlib
+  HOST_CC = clang
+  SYSROOT = /c/wenv/msys2/msys64/clangarm64
+  # Common windows flags
+  COMMON_WIN_CFLAGS = -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE
+  COMMON_WIN_CFLAGS += -Wno-deprecated-declarations -Wno-unused-variable
+  # For mathtest
+  HOST_CFLAGS += -I$(SYSROOT)/include
+  HOST_CFLAGS += $(COMMON_WIN_CFLAGS) -Wno-ignored-attributes
+  # Clear the default flag -fPIC, as not supported on Windows
+  CFLAGS_SHARED =
+  # For ulp.h with MPFR
+  CFLAGS += -I$(SYSROOT)/include
+  # For clang on Windows
+  CFLAGS += $(COMMON_WIN_CFLAGS)
+endif
+
 # Optimize the shared libraries on aarch64 assuming they fit in 1M.
 #CFLAGS_SHARED = -fPIC -mcmodel=tiny

@ -45,12 +63,33 @@ math-cflags =
 math-ldlibs =
 math-ulpflags =
 math-testflags =
-string-cflags =
+string-cflags = -falign-functions=64
 networking-cflags =

-# Use if mpfr is available on the target for ulp error checking.
-#math-ldlibs += -lmpfr -lgmp
-#math-cflags += -DUSE_MPFR
+ifeq ($(OS),Msys)
+  # Libraries can be installed with pacman
+  libm-libs = -lmsvcrt -lvcruntime -lucrt
+  libc-libs =
+  # Linker will look for .lib but some systems only have .dll.a,
+  # therefore we have to give absolute path to libraries.
+  # This is system dependent and might need adjusting.
+  mpfr-libs = $(SYSROOT)/lib/libmpfr.dll.a
+  gmp-libs = $(SYSROOT)/lib/libgmp.dll.a
+  mpc-libs = $(SYSROOT)/lib/libmpc.dll.a
+endif
+
+# Use if mpfr is available on the target for ulp error checking. If
+# enabling this, it is advised to disable fenv checks by uncommenting
+# the two lines at the bottom of this block.
+USE_MPFR=0
+math-cflags += -DUSE_MPFR=$(USE_MPFR)
+ifeq ($(USE_MPFR), 1)
+  math-ldlibs += $(mpfr-libs) $(gmp-libs)
+  math-ulpflags += -m
+endif
+# Disable fenv checks
+#math-ulpflags = -q -f
+#math-testflags = -nostatus

 # Use with gcc.
 math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector
@ -59,30 +98,36 @@ math-cflags += -ffp-contract=fast -fno-math-errno
 # Use with clang.
 #math-cflags += -ffp-contract=fast

-# Disable/enable SVE vector math code and tests.
-# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
-# routines only so that SVE code does not leak into scalar
-# routines. It is also necessary to add it for tools (e.g. ulp,
-# mathbench)
-WANT_SVE_MATH = 0
-ifeq ($(WANT_SVE_MATH), 1)
-  math-sve-cflags = -march=armv8-a+sve
-endif
-math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
-
 # If defined to 1, set errno in math functions according to ISO C.  Many math
 # libraries do not set errno, so this is 0 by default.  It may need to be
 # set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
 WANT_ERRNO = 0
 math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)

+# Disable/enable SVE vector math tests/tools.
+ifeq ($(ARCH),aarch64)
+  WANT_SVE_TESTS = 1
+else
+  WANT_SVE_TESTS = 0
+endif
+math-cflags += -DWANT_SVE_TESTS=$(WANT_SVE_TESTS)
+
 # If set to 1, set fenv in vector math routines.
 WANT_SIMD_EXCEPT = 0
 math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)

-# Disable fenv checks
-#math-ulpflags = -q -f
-#math-testflags = -nostatus
+# If set to 1, enable tests for exp10.
+WANT_EXP10_TESTS = 1
+math-cflags += -DWANT_EXP10_TESTS=$(WANT_EXP10_TESTS)
+
+# If set to 1, enable tests for sinpi and cospi. These functions are
+# only supported on aarch64
+ifeq ($(ARCH),aarch64)
+  WANT_TRIGPI_TESTS = 1
+else
+  WANT_TRIGPI_TESTS = 0
+endif
+math-cflags += -DWANT_TRIGPI_TESTS=$(WANT_TRIGPI_TESTS)

 # Remove GNU Property Notes from asm files.
 #string-cflags += -DWANT_GNU_PROPERTY=0
@ -92,3 +137,13 @@ math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)

 # Avoid auto-vectorization of scalar code and unroll loops
 networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
+
+# Provide *_finite symbols and some of the glibc hidden symbols
+# so libmathlib can be used with binaries compiled against glibc
+# to interpose math functions with both static and dynamic linking
+USE_GLIBC_ABI = 1
+math-cflags += -DUSE_GLIBC_ABI=$(USE_GLIBC_ABI)
+
+# Enable experimental math routines - non-C23 vector math and low-accuracy scalar
+WANT_EXPERIMENTAL_MATH = 0
+math-cflags += -DWANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH)
--- a/contrib/arm-optimized-routines/math/Dir.mk
+++ b/contrib/arm-optimized-routines/math/Dir.mk
@ -1,23 +1,61 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019-2023, Arm Limited.
+# Copyright (c) 2019-2024, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception

-S := $(srcdir)/math
-B := build/math
+.SECONDEXPANSION:

-math-lib-srcs := $(wildcard $(S)/*.[cS])
-math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+ifneq ($(OS),Linux)
+  ifeq ($(WANT_SIMD_EXCEPT),1)
+    $(error WANT_SIMD_EXCEPT is not supported outside Linux)
+  endif
+  ifneq ($(USE_MPFR),1)
+    $(warning WARNING: Double-precision ULP tests will not be usable without MPFR)
+  endif
+  ifeq ($(USE_GLIBC_ABI),1)
+    $(error Can only generate special GLIBC symbols on Linux - please disable USE_GLIBC_ABI)
+  endif
+endif
+
+ifneq ($(ARCH),aarch64)
+  ifeq ($(WANT_TRIGPI_TESTS),1)
+    $(error trigpi functions only supported on aarch64)
+  endif
+  ifeq ($(WANT_EXPERIMENTAL_MATH),1)
+    $(error Experimental math only supported on aarch64)
+  endif
+endif
+
+math-src-dir := $(srcdir)/math
+math-build-dir := build/math
+
+math-lib-srcs := $(wildcard $(math-src-dir)/*.[cS])
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*.[cS])
+ifeq ($(OS),Linux)
+# Vector symbols only supported on Linux
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*/*.[cS])
+endif
+
+ifeq ($(WANT_EXPERIMENTAL_MATH), 1)
+ifeq ($(OS),Linux)
+# Vector symbols only supported on Linux
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*/*.[cS])
+else
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*.[cS])
+endif
+else
+# Scalar experimental symbols will have been added by wildcard, so remove them
+math-lib-srcs := $(filter-out $(math-src-dir)/aarch64/experimental/%, $(math-lib-srcs))
+endif

 math-test-srcs := \
-	$(S)/test/mathtest.c \
-	$(S)/test/mathbench.c \
-	$(S)/test/ulp.c \
+	$(math-src-dir)/test/mathtest.c \
+	$(math-src-dir)/test/mathbench.c \
+	$(math-src-dir)/test/ulp.c \

-math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
+math-test-host-srcs := $(wildcard $(math-src-dir)/test/rtest/*.[cS])

-math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
-math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
+math-includes := $(patsubst $(math-src-dir)/%,build/%,$(wildcard $(math-src-dir)/include/*.h))

 math-libs := \
 	build/lib/libmathlib.so \
@ -33,9 +71,9 @@ math-tools := \
 math-host-tools := \
 	build/bin/rtest \

-math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
-math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs)))
-math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
+math-lib-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-lib-srcs)))
+math-test-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-srcs)))
+math-host-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-host-srcs)))
 math-target-objs := $(math-lib-objs) $(math-test-objs)
 math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)

@ -44,18 +82,69 @@ math-files := \
 	$(math-libs) \
 	$(math-tools) \
 	$(math-host-tools) \
-	$(math-includes) \
-	$(math-test-includes) \
+	$(math-includes)

-all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes)

-$(math-objs): $(math-includes) $(math-test-includes)
+$(math-objs): $(math-includes)
 $(math-objs): CFLAGS_ALL += $(math-cflags)
-$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
+$(math-build-dir)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
 $(math-host-objs): CC = $(HOST_CC)
 $(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS)

-$(B)/test/ulp.o: $(S)/test/ulp.h
+# Add include path for experimental routines so they can share helpers with non-experimental
+$(math-build-dir)/aarch64/experimental/advsimd/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/advsimd
+$(math-build-dir)/aarch64/experimental/sve/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/sve
+
+$(math-objs): CFLAGS_ALL += -I$(math-src-dir)
+
+ulp-funcs-dir = build/test/ulp-funcs/
+ulp-wrappers-dir = build/test/ulp-wrappers/
+mathbench-funcs-dir = build/test/mathbench-funcs/
+test-sig-dirs = $(ulp-funcs-dir) $(ulp-wrappers-dir) $(mathbench-funcs-dir)
+build/include/test $(test-sig-dirs) $(addsuffix /$(ARCH),$(test-sig-dirs)) $(addsuffix /aarch64/experimental,$(test-sig-dirs)) \
+$(addsuffix /aarch64/experimental/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/experimental/sve,$(test-sig-dirs)) \
+$(addsuffix /aarch64/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/sve,$(test-sig-dirs)):
+	mkdir -p $@
+
+ulp-funcs = $(patsubst $(math-src-dir)/%,$(ulp-funcs-dir)/%,$(basename $(math-lib-srcs)))
+ulp-wrappers = $(patsubst $(math-src-dir)/%,$(ulp-wrappers-dir)/%,$(basename $(math-lib-srcs)))
+mathbench-funcs = $(patsubst $(math-src-dir)/%,$(mathbench-funcs-dir)/%,$(basename $(math-lib-srcs)))
+
+ifeq ($(WANT_SVE_TESTS), 0)
+  # Filter out anything with sve in the path
+  ulp-funcs := $(foreach a,$(ulp-funcs),$(if $(findstring sve,$a),,$a))
+  ulp-wrappers := $(foreach a,$(ulp-wrappers),$(if $(findstring sve,$a),,$a))
+  mathbench-funcs := $(foreach a,$(mathbench-funcs),$(if $(findstring sve,$a),,$a))
+endif
+
+define emit_sig
+$1/aarch64/experimental/sve/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/sve
+$1/aarch64/experimental/advsimd/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/advsimd
+$1/%.i: $(math-src-dir)/%.c | $$$$(@D)
+	$(CC) $$< $(math-cflags) -I$(math-src-dir)/include -I$(math-src-dir) $$(EXTRA_INC) -D$2 -E -o $$@
+$1/%: $1/%.i
+	{ grep TEST_SIG $$< || true; } | cut -f 2- -d ' ' > $$@
+endef
+
+$(eval $(call emit_sig,$(ulp-funcs-dir),EMIT_ULP_FUNCS))
+$(eval $(call emit_sig,$(ulp-wrappers-dir),EMIT_ULP_WRAPPERS))
+$(eval $(call emit_sig,$(mathbench-funcs-dir),EMIT_MATHBENCH_FUNCS))
+
+ulp-funcs-gen = build/include/test/ulp_funcs_gen.h
+ulp-wrappers-gen = build/include/test/ulp_wrappers_gen.h
+mathbench-funcs-gen = build/include/test/mathbench_funcs_gen.h
+math-tools-autogen-headers = $(ulp-funcs-gen) $(ulp-wrappers-gen) $(mathbench-funcs-gen)
+
+$(ulp-funcs-gen): $(ulp-funcs) | $$(@D)
+$(ulp-wrappers-gen): $(ulp-wrappers) | $$(@D)
+$(mathbench-funcs-gen): $(mathbench-funcs) | $$(@D)
+
+$(math-tools-autogen-headers): | $$(@D)
+	cat $^ | sort -u > $@
+
+$(math-build-dir)/test/mathbench.o: $(mathbench-funcs-gen)
+$(math-build-dir)/test/ulp.o: $(math-src-dir)/test/ulp.h $(ulp-funcs-gen) $(ulp-wrappers-gen)

 build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
 	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
@ -65,38 +154,40 @@ build/lib/libmathlib.a: $(math-lib-objs)
 	$(AR) rc $@ $^
 	$(RANLIB) $@

-$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
-$(math-tools): LDLIBS += $(math-ldlibs) -lm
-# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
-$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
+$(math-host-tools): HOST_LDLIBS += $(libm-libs) $(mpfr-libs) $(mpc-libs)
+$(math-tools): LDLIBS += $(math-ldlibs) $(libm-libs)
+
+ifneq ($(OS),Darwin)
+  $(math-tools): LDFLAGS += -static
+endif

 build/bin/rtest: $(math-host-objs)
 	$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)

-build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a
-	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/mathtest: $(math-build-dir)/test/mathtest.o build/lib/libmathlib.a
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)

-build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a
-	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/mathbench: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)

 # This is not ideal, but allows custom symbols in mathbench to get resolved.
-build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a
-	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm
+build/bin/mathbench_libc: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $< $(libm-libs) $(libc-libs) build/lib/libmathlib.a $(libm-libs)

-build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
-	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/ulp: $(math-build-dir)/test/ulp.o build/lib/libmathlib.a
+	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(LDLIBS)

-build/include/%.h: $(S)/include/%.h
+build/include/%.h: $(math-src-dir)/include/%.h
 	cp $< $@

-build/include/test/%.h: $(S)/test/%.h
+build/bin/%.sh: $(math-src-dir)/test/%.sh
 	cp $< $@

-build/bin/%.sh: $(S)/test/%.sh
-	cp $< $@
-
-math-tests := $(wildcard $(S)/test/testcases/directed/*.tst)
-math-rtests := $(wildcard $(S)/test/testcases/random/*.tst)
+math-tests := $(wildcard $(math-src-dir)/test/testcases/directed/*.tst)
+ifneq ($(WANT_EXP10_TESTS),1)
+math-tests := $(filter-out %exp10.tst, $(math-tests))
+endif
+math-rtests := $(wildcard $(math-src-dir)/test/testcases/random/*.tst)

 check-math-test: $(math-tools)
 	cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags)
@ -104,8 +195,88 @@ check-math-test: $(math-tools)
 check-math-rtest: $(math-host-tools) $(math-tools)
 	cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)

+ulp-input-dir = $(math-build-dir)/test/inputs
+$(ulp-input-dir) $(ulp-input-dir)/$(ARCH) $(ulp-input-dir)/aarch64/sve $(ulp-input-dir)/aarch64/advsimd \
+$(ulp-input-dir)/aarch64/experimental $(ulp-input-dir)/aarch64/experimental/advsimd $(ulp-input-dir)/aarch64/experimental/sve:
+	mkdir -p $@
+
+math-lib-lims = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp,$(math-lib-srcs))
+math-lib-lims-nn = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp_nn,$(math-lib-srcs))
+math-lib-fenvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.fenv,$(math-lib-srcs))
+math-lib-itvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.itv,$(math-lib-srcs))
+math-lib-cvals = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.cval,$(math-lib-srcs))
+
+ulp-inputs = $(math-lib-lims) $(math-lib-lims-nn) $(math-lib-fenvs) $(math-lib-itvs) $(math-lib-cvals)
+$(ulp-inputs): CFLAGS = -I$(math-src-dir)/test -I$(math-src-dir)/include -I$(math-src-dir) $(math-cflags)\
+                        -I$(math-src-dir)/aarch64/advsimd -I$(math-src-dir)/aarch64/sve
+
+$(ulp-input-dir)/%.ulp.i: $(math-src-dir)/%.c | $$(@D)
+	$(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.ulp: $(ulp-input-dir)/%.ulp.i
+	{ grep "TEST_ULP " $< || true; } > $@
+
+$(ulp-input-dir)/%.ulp_nn.i: $(math-src-dir)/%.c | $$(@D)
+	$(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.ulp_nn: $(ulp-input-dir)/%.ulp_nn.i
+	{ grep "TEST_ULP_NONNEAREST " $< || true; } > $@
+
+$(ulp-input-dir)/%.fenv.i: $(math-src-dir)/%.c | $$(@D)
+	$(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.fenv: $(ulp-input-dir)/%.fenv.i
+	{ grep "TEST_DISABLE_FENV " $< || true; } > $@
+
+$(ulp-input-dir)/%.itv.i: $(math-src-dir)/%.c | $$(@D)
+	$(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.itv: $(ulp-input-dir)/%.itv.i
+	{ grep "TEST_INTERVAL " $< || true; } | sed "s/ TEST_INTERVAL/\nTEST_INTERVAL/g" > $@
+
+$(ulp-input-dir)/%.cval.i: $(math-src-dir)/%.c | $$(@D)
+	$(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.cval: $(ulp-input-dir)/%.cval.i
+	{ grep "TEST_CONTROL_VALUE " $< || true; } > $@
+
+ulp-lims = $(ulp-input-dir)/limits
+$(ulp-lims): $(math-lib-lims)
+
+ulp-lims-nn = $(ulp-input-dir)/limits_nn
+$(ulp-lims-nn): $(math-lib-lims-nn)
+
+fenv-exps := $(ulp-input-dir)/fenv
+$(fenv-exps): $(math-lib-fenvs)
+
+generic-itvs = $(ulp-input-dir)/itvs
+$(generic-itvs): $(filter-out $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
+
+arch-itvs = $(ulp-input-dir)/$(ARCH)/itvs
+$(arch-itvs): $(filter $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
+
+ulp-cvals := $(ulp-input-dir)/cvals
+$(ulp-cvals): $(math-lib-cvals)
+
+# Remove first word, which will be TEST directive
+$(ulp-lims) $(ulp-lims-nn) $(fenv-exps) $(arch-itvs) $(generic-itvs) $(ulp-cvals): | $$(@D)
+	sed "s/TEST_[^ ]* //g" $^ | sort -u > $@
+
+check-math-ulp: $(ulp-lims) $(ulp-lims-nn)
+check-math-ulp: $(fenv-exps) $(ulp-cvals)
+check-math-ulp: $(generic-itvs) $(arch-itvs)
 check-math-ulp: $(math-tools)
-	ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
+	ULPFLAGS="$(math-ulpflags)" \
+	LIMITS=../../$(ulp-lims) \
+	ARCH_ITVS=../../$(arch-itvs) \
+	GEN_ITVS=../../$(generic-itvs) \
+	DISABLE_FENV=../../$(fenv-exps) \
+	CVALS=../../$(ulp-cvals) \
+	FUNC=$(func) \
+	WANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH) \
+	WANT_SVE_TESTS=$(WANT_SVE_TESTS) \
+	USE_MPFR=$(USE_MPFR) \
+	build/bin/runulp.sh $(EMULATOR)

 check-math: check-math-test check-math-rtest check-math-ulp

--- a/contrib/arm-optimized-routines/math/README.contributors
+++ b/contrib/arm-optimized-routines/math/README.contributors
@ -1,8 +1,9 @@
 STYLE REQUIREMENTS
 ==================

-1. Most code in this sub-directory is expected to be upstreamed into glibc so
-   the GNU Coding Standard and glibc specific conventions should be followed
+1. With the exception of math/aarch64/experimental/, most code in this
+   sub-directory is expected to be upstreamed into glibc so the GNU
+   Coding Standard and glibc specific conventions should be followed
   to ease upstreaming.

 2. ABI and symbols: the code should be written so it is suitable for inclusion
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c
@ -1,14 +1,14 @@
 /*
 * Double-precision vector acos(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -30,8 +30,8 @@ static const struct data
 };

 #define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-53.  */
+#define Oneu 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-53.  */

 #if WANT_SIMD_EXCEPT
 static float64x2_t VPCS_ATTR NOINLINE
@ -111,12 +111,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
  return vfmaq_f64 (add, mul, y);
 }

-PL_SIG (V, D, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_D1 (acos), 1.02)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
+TEST_SIG (V, D, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (acos), 1.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c
@ -1,14 +1,14 @@
 /*
 * Single-precision vector acos(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -57,8 +57,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)

   The largest observed error in this region is 1.32 ulps,
   _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
-			   want 0x1.feb32ep-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
+				 want 0x1.feb32ep-1.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -102,12 +102,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
  return vfmaq_f32 (add, mul, y);
 }

-PL_SIG (V, F, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_F1 (acos), 0.82)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
+HALF_WIDTH_ALIAS_F1 (acos)
+
+TEST_SIG (V, F, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (acos), 0.82)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c
@ -1,12 +1,12 @@
 /*
- * Single-precision vector acosh(x) function.
- * Copyright (c) 2023, Arm Limited.
+ * Double-precision vector acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #define WANT_V_LOG1P_K0_SHORTCUT 1
 #include "v_log1p_inline.h"
@ -45,9 +45,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
    x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
 #endif

-  float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
-  float64x2_t y;
-  y = vaddq_f64 (x, v_f64 (1));
+  float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
+  float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
  y = vmulq_f64 (y, xm1);
  y = vsqrtq_f64 (y);
  y = vaddq_f64 (xm1, y);
@ -57,10 +56,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
  return log1p_inline (y, &d->log1p_consts);
 }

-PL_SIG (V, D, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (acosh), 2.53)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
+TEST_SIG (V, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (V_NAME_D1 (acosh), 2.53)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
+TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c
@ -1,49 +1,46 @@
 /*
 * Single-precision vector acosh(x) function.
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
 #include "v_log1pf_inline.h"

+#define SquareLim 0x1p64
+
 const static struct data
 {
  struct v_log1pf_data log1pf_consts;
  uint32x4_t one;
-  uint16x4_t thresh;
-} data = {
-  .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
-  .one = V4 (0x3f800000),
-  .thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1).  */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };

-#define SignMask 0x80000000
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)).  */

 static float32x4_t NOINLINE VPCS_ATTR
 special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
-	      const struct v_log1pf_data d)
+	      const struct v_log1pf_data *d)
 {
  return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
 }

 /* Vector approximation for single-precision acosh, based on log1p. Maximum
   error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
-   is 2.78 ULP:
-   __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
-			   want 0x1.ef9ea2p-3.
+   is 3.00 ULP:
+   _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+				 want 0x1.ef0a7cp-4.
   With exceptions disabled, we can compute u with a shorter dependency chain,
-   which gives maximum error of 3.07 ULP:
-  __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
-			   want 0x1.fbc7f4p-4.  */
+   which gives maximum error of 3.22 ULP:
+   _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+				 want 0x1.fdcdd2p-5.  */

-VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
  uint32x4_t ix = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);

 #if WANT_SIMD_EXCEPT
  /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@ -54,25 +51,28 @@ VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
  float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
  float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
 #else
-  float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
-  float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+  float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+  float32x4_t u
+      = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
 #endif

  float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));

  if (unlikely (v_any_u16h (special)))
-    return special_case (x, y, special, d->log1pf_consts);
-  return log1pf_inline (y, d->log1pf_consts);
+    return special_case (x, y, special, &d->log1pf_consts);
+  return log1pf_inline (y, &d->log1pf_consts);
 }

-PL_SIG (V, F, 1, acosh, 1.0, 10.0)
+HALF_WIDTH_ALIAS_F1 (acosh)
+
+TEST_SIG (V, F, 1, acosh, 1.0, 10.0)
 #if WANT_SIMD_EXCEPT
-PL_TEST_ULP (V_NAME_F1 (acosh), 2.29)
+TEST_ULP (V_NAME_F1 (acosh), 2.50)
 #else
-PL_TEST_ULP (V_NAME_F1 (acosh), 2.58)
+TEST_ULP (V_NAME_F1 (acosh), 2.78)
 #endif
-PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
+TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
+TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c
@ -1,36 +1,35 @@
 /*
 * Double-precision vector asin(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
-  float64x2_t poly[12];
+  float64x2_t c0, c2, c4, c6, c8, c10;
  float64x2_t pi_over_2;
  uint64x2_t abs_mask;
+  double c1, c3, c5, c7, c9, c11;
 } data = {
  /* Polynomial approximation of  (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
     on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
-  .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
-	    V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
-	    V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
-	    V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
-	    V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
-	    V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
-  .pi_over_2 = V2 (0x1.921fb54442d18p+0),
-  .abs_mask = V2 (0x7fffffffffffffff),
+  .c0 = V2 (0x1.555555555554ep-3),	  .c1 = 0x1.3333333337233p-4,
+  .c2 = V2 (0x1.6db6db67f6d9fp-5),	  .c3 = 0x1.f1c71fbd29fbbp-6,
+  .c4 = V2 (0x1.6e8b264d467d6p-6),	  .c5 = 0x1.1c5997c357e9dp-6,
+  .c6 = V2 (0x1.c86a22cd9389dp-7),	  .c7 = 0x1.856073c22ebbep-7,
+  .c8 = V2 (0x1.fd1151acb6bedp-8),	  .c9 = 0x1.087182f799c1dp-6,
+  .c10 = V2 (-0x1.6602748120927p-7),	  .c11 = 0x1.cfa0dd1f9478p-6,
+  .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
 };

 #define AllMask v_u64 (0xffffffffffffffff)
-#define One (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-12.  */
+#define One 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-12.  */

 #if WANT_SIMD_EXCEPT
 static float64x2_t VPCS_ATTR NOINLINE
@ -58,12 +57,11 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
     asin(x) = pi/2 - (y + y * z * P(z)), with  z = (1-x)/2 and y = sqrt(z).

   The largest observed error in this region is 2.69 ulps,
-   _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
-				       want 0x1.110d7e85fdd53p-1.  */
+   _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+				       want 0x1.1111dd54ddf99p-1.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
 {
  const struct data *d = ptr_barrier (&data);
-
  float64x2_t ax = vabsq_f64 (x);

 #if WANT_SIMD_EXCEPT
@ -76,7 +74,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
    return special_case (x, x, AllMask);
 #endif

-  uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
+  uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));

  /* Evaluate polynomial Q(x) = y + y * z * P(z) with
     z = x ^ 2 and y = |x|            , if |x| < 0.5
@ -89,7 +87,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
  float64x2_t z4 = vmulq_f64 (z2, z2);
  float64x2_t z8 = vmulq_f64 (z4, z4);
  float64x2_t z16 = vmulq_f64 (z8, z8);
-  float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+  /* order-11 estrin.  */
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
+
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
+
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
+
+  float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
+  float64x2_t p = vfmaq_f64 (p07, z16, p811);

  /* Finalize polynomial: z + z * z2 * P(z2).  */
  p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
@ -102,12 +119,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
  return vbslq_f64 (d->abs_mask, y, x);
 }

-PL_SIG (V, D, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_D1 (asin), 2.19)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
+TEST_SIG (V, D, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (asin), 2.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c
@ -1,14 +1,14 @@
 /*
 * Single-precision vector asin(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)

   The largest observed error in this region is 2.41 ulps,
     _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -93,12 +93,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
  return vbslq_f32 (v_u32 (AbsMask), y, x);
 }

-PL_SIG (V, F, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_F1 (asin), 1.91)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
+HALF_WIDTH_ALIAS_F1 (asin)
+
+TEST_SIG (V, F, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (asin), 1.91)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c
@ -0,0 +1,242 @@
+/*
+ * Double-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "test_defs.h"
+#include "test_sig.h"
+#include "v_math.h"
+
+const static struct data
+{
+  uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+  float64x2_t tiny_bound;
+#endif
+  float64x2_t lc0, lc2;
+  double lc1, lc3, ln2, lc4;
+
+  float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+  double c1, c3, c5, c7, c9, c11, c13, c15;
+
+} data = {
+
+#if WANT_SIMD_EXCEPT
+  .tiny_bound = V2 (0x1p-26),
+#endif
+  /* Even terms of polynomial s.t. asinh(x) is approximated by
+     asinh(x) ~= x + x^3 * (C0 + C1 * x^2 + C2 * x^4 + C3 * x^6 + ...).
+     Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2).  */
+
+  .c0 = V2 (-0x1.55555555554a7p-3),
+  .c1 = 0x1.3333333326c7p-4,
+  .c2 = V2 (-0x1.6db6db68332e6p-5),
+  .c3 = 0x1.f1c71b26fb40dp-6,
+  .c4 = V2 (-0x1.6e8b8b654a621p-6),
+  .c5 = 0x1.1c4daa9e67871p-6,
+  .c6 = V2 (-0x1.c9871d10885afp-7),
+  .c7 = 0x1.7a16e8d9d2ecfp-7,
+  .c8 = V2 (-0x1.3ddca533e9f54p-7),
+  .c9 = 0x1.0becef748dafcp-7,
+  .c10 = V2 (-0x1.b90c7099dd397p-8),
+  .c11 = 0x1.541f2bb1ffe51p-8,
+  .c12 = V2 (-0x1.d217026a669ecp-9),
+  .c13 = 0x1.0b5c7977aaf7p-9,
+  .c14 = V2 (-0x1.e0f37daef9127p-11),
+  .c15 = 0x1.388b5fe542a6p-12,
+  .c16 = V2 (-0x1.021a48685e287p-14),
+  .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+  .lc0 = V2 (-0x1.ffffffffffff7p-2),
+  .lc1 = 0x1.55555555170d4p-2,
+  .lc2 = V2 (-0x1.0000000399c27p-2),
+  .lc3 = 0x1.999b2e90e94cap-3,
+  .lc4 = -0x1.554e550bd501ep-3,
+  .ln2 = 0x1.62e42fefa39efp-1,
+
+  .off = V2 (0x3fe6900900000000),
+  .huge_bound = V2 (0x5fe0000000000000),
+  .abs_mask = V2 (0x7fffffffffffffff),
+  .mask = V2 (0xfffULL << 52),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+	      uint64x2_t special)
+{
+  /* Copy sign.  */
+  y = vbslq_f64 (abs_mask, y, x);
+  return v_call_f64 (asinh, x, y, special);
+}
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
+}
+
+static inline float64x2_t
+log_inline (float64x2_t xm, const struct data *d)
+{
+
+  uint64x2_t u = vreinterpretq_u64_f64 (xm);
+  uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+  int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+  uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
+  float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+  struct entry e = lookup (u_off);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  float64x2_t kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+  float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+  float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+  y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+  y = vfmaq_f64 (p, r2, y);
+  return vfmaq_f64 (hi, y, r2);
+}
+
+/* Double-precision implementation of vector asinh(x).
+   asinh is very sensitive around 1, so it is impractical to devise a single
+   low-cost algorithm which is sufficiently accurate on a wide range of input.
+   Instead we use two different algorithms:
+   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1))     if |x| >= 1
+	    = sign(x) * (|x| + |x|^3 * P(x^2))       otherwise
+   where log(x) is an optimized log approximation, and P(x) is a polynomial
+   shared with the scalar routine. The greatest observed error is 2.79 ULP, in
+   |x| >= 1:
+   _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+				       want  0x1.ffffd003219ddp-1.  */
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t ax = vabsq_f64 (x);
+
+  uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+
+#if WANT_SIMD_EXCEPT
+  uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+  uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
+  uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+  special = vorrq_u64 (special, tiny);
+#else
+  uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
+#endif
+
+  /* Option 1: |x| >= 1.
+     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
+     If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+     overflow, by setting special lanes to 0. These will be fixed later.  */
+  float64x2_t option_1 = v_f64 (0);
+  if (likely (v_any_u64 (gt1)))
+    {
+#if WANT_SIMD_EXCEPT
+      float64x2_t xm = v_zerofy_f64 (ax, special);
+#else
+      float64x2_t xm = ax;
+#endif
+      option_1 = log_inline (
+	  vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+    }
+
+  /* Option 2: |x| < 1.
+     Compute asinh(x) using a polynomial.
+     If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+     overflow, and tiny lanes, which will underflow, by setting them to 0. They
+     will be fixed later, either by selecting x or falling back to the scalar
+     special-case. The largest observed error in this region is 1.47 ULPs:
+     _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+					 want 0x1.c1d6bf874019cp-1.  */
+  float64x2_t option_2 = v_f64 (0);
+
+  if (likely (v_any_u64 (vceqzq_u64 (gt1))))
+    {
+
+#if WANT_SIMD_EXCEPT
+      ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
+#endif
+      float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+      /* Order-17 Pairwise Horner scheme.  */
+      float64x2_t c13 = vld1q_f64 (&d->c1);
+      float64x2_t c57 = vld1q_f64 (&d->c5);
+      float64x2_t c911 = vld1q_f64 (&d->c9);
+      float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+      float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+      float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+      float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+      float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+      float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+      float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+      float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+      float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+      float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+      float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+      p = vfmaq_f64 (p1213, z2, p);
+      p = vfmaq_f64 (p1011, z2, p);
+      p = vfmaq_f64 (p89, z2, p);
+
+      p = vfmaq_f64 (p67, z2, p);
+      p = vfmaq_f64 (p45, z2, p);
+
+      p = vfmaq_f64 (p23, z2, p);
+
+      p = vfmaq_f64 (p01, z2, p);
+      option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
+#if WANT_SIMD_EXCEPT
+      option_2 = vbslq_f64 (tiny, x, option_2);
+#endif
+    }
+
+  /* Choose the right option for each lane.  */
+  float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
+  if (unlikely (v_any_u64 (special)))
+    {
+      return special_case (x, y, d->abs_mask, special);
+    }
+  /* Copy sign.  */
+  return vbslq_f64 (d->abs_mask, y, x);
+}
+
+TEST_SIG (V, D, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (asinh), 2.29)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+   Ensures the v_sel is choosing the right option in all cases.  */
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c
@ -0,0 +1,89 @@
+/*
+ * Single-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+  struct v_log1pf_data log1pf_consts;
+  float32x4_t one;
+  uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+  uint32x4_t tiny_bound;
+#endif
+} data = {
+  .one = V4 (1),
+  .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+  .big_bound = V4 (0x5f800000), /* asuint(0x1p64).  */
+#if WANT_SIMD_EXCEPT
+  .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30).  */
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+	      uint32x4_t special, const struct data *d)
+{
+  return v_call_f32 (
+      asinhf, x,
+      vreinterpretq_f32_u32 (veorq_u32 (
+	  sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+      special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+   Worst-case error is 2.59 ULP:
+   _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+				 want 0x1.d449c4p-3.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+{
+  const struct data *dat = ptr_barrier (&data);
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+  uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
+  float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+  /* Sidestep tiny and large values to avoid inadvertently triggering
+     under/overflow.  */
+  special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
+  if (unlikely (v_any_u32 (special)))
+    {
+      ax = v_zerofy_f32 (ax, special);
+      x = v_zerofy_f32 (x, special);
+    }
+#endif
+
+  /* asinh(x) = log(x + sqrt(x * x + 1)).
+     For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))).  */
+  float32x4_t d
+      = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+  float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
+
+  if (unlikely (v_any_u32 (special)))
+    return special_case (special_arg, sign, y, special, dat);
+  return vreinterpretq_f32_u32 (veorq_u32 (
+      sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
+}
+
+HALF_WIDTH_ALIAS_F1 (asinh)
+
+TEST_SIG (V, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (asinh), 2.10)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c
@ -1,32 +1,32 @@
 /*
 * Double-precision vector atan(x) function.
 *
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
+  float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
  float64x2_t pi_over_2;
-  float64x2_t poly[20];
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
 } data = {
  /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
 	      [2**-1022, 1.0].  */
-  .poly = { V2 (-0x1.5555555555555p-2),	 V2 (0x1.99999999996c1p-3),
-	    V2 (-0x1.2492492478f88p-3),	 V2 (0x1.c71c71bc3951cp-4),
-	    V2 (-0x1.745d160a7e368p-4),	 V2 (0x1.3b139b6a88ba1p-4),
-	    V2 (-0x1.11100ee084227p-4),	 V2 (0x1.e1d0f9696f63bp-5),
-	    V2 (-0x1.aebfe7b418581p-5),	 V2 (0x1.842dbe9b0d916p-5),
-	    V2 (-0x1.5d30140ae5e99p-5),	 V2 (0x1.338e31eb2fbbcp-5),
-	    V2 (-0x1.00e6eece7de8p-5),	 V2 (0x1.860897b29e5efp-6),
-	    V2 (-0x1.0051381722a59p-6),	 V2 (0x1.14e9dc19a4a4ep-7),
-	    V2 (-0x1.d0062b42fe3bfp-9),	 V2 (0x1.17739e210171ap-10),
-	    V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+  .c0 = V2 (-0x1.5555555555555p-2),	  .c1 = 0x1.99999999996c1p-3,
+  .c2 = V2 (-0x1.2492492478f88p-3),	  .c3 = 0x1.c71c71bc3951cp-4,
+  .c4 = V2 (-0x1.745d160a7e368p-4),	  .c5 = 0x1.3b139b6a88ba1p-4,
+  .c6 = V2 (-0x1.11100ee084227p-4),	  .c7 = 0x1.e1d0f9696f63bp-5,
+  .c8 = V2 (-0x1.aebfe7b418581p-5),	  .c9 = 0x1.842dbe9b0d916p-5,
+  .c10 = V2 (-0x1.5d30140ae5e99p-5),	  .c11 = 0x1.338e31eb2fbbcp-5,
+  .c12 = V2 (-0x1.00e6eece7de8p-5),	  .c13 = 0x1.860897b29e5efp-6,
+  .c14 = V2 (-0x1.0051381722a59p-6),	  .c15 = 0x1.14e9dc19a4a4ep-7,
+  .c16 = V2 (-0x1.d0062b42fe3bfp-9),	  .c17 = 0x1.17739e210171ap-10,
+  .c18 = V2 (-0x1.ab24da7be7402p-13),	  .c19 = 0x1.358851160a528p-16,
  .pi_over_2 = V2 (0x1.921fb54442d18p+0),
 };

@ -42,6 +42,11 @@ static const struct data
 float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
 {
  const struct data *d = ptr_barrier (&data);
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+  float64x2_t c1315 = vld1q_f64 (&d->c13);
+  float64x2_t c1719 = vld1q_f64 (&d->c17);

  /* Small cases, infs and nans are supported by our approximation technique,
     but do not set fenv flags correctly. Only trigger special case if we need
@ -80,9 +85,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
  float64x2_t x2 = vmulq_f64 (z2, z2);
  float64x2_t x4 = vmulq_f64 (x2, x2);
  float64x2_t x8 = vmulq_f64 (x4, x4);
-  float64x2_t y
-      = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
-		   v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+  /* estrin_7.  */
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+  float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+  /* estrin_11.  */
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+  float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+  float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+  float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+  float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+  float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+  float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+  float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+  float64x2_t y = vfmaq_f64 (p07, p819, x8);

  /* Finalize. y = shift + z + z^3 * P(z^2).  */
  y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
@ -93,12 +124,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
  return y;
 }

-PL_SIG (V, D, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (atan), 1.78)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
+TEST_SIG (V, D, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (atan), 1.78)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c
@ -0,0 +1,171 @@
+/*
+ * Double-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+  float64x2_t pi_over_2;
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+  uint64x2_t zeroinfnan, minustwo;
+} data = {
+  /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+	      [2**-1022, 1.0].  */
+  .c0 = V2 (-0x1.5555555555555p-2),
+  .c1 = 0x1.99999999996c1p-3,
+  .c2 = V2 (-0x1.2492492478f88p-3),
+  .c3 = 0x1.c71c71bc3951cp-4,
+  .c4 = V2 (-0x1.745d160a7e368p-4),
+  .c5 = 0x1.3b139b6a88ba1p-4,
+  .c6 = V2 (-0x1.11100ee084227p-4),
+  .c7 = 0x1.e1d0f9696f63bp-5,
+  .c8 = V2 (-0x1.aebfe7b418581p-5),
+  .c9 = 0x1.842dbe9b0d916p-5,
+  .c10 = V2 (-0x1.5d30140ae5e99p-5),
+  .c11 = 0x1.338e31eb2fbbcp-5,
+  .c12 = V2 (-0x1.00e6eece7de8p-5),
+  .c13 = 0x1.860897b29e5efp-6,
+  .c14 = V2 (-0x1.0051381722a59p-6),
+  .c15 = 0x1.14e9dc19a4a4ep-7,
+  .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+  .c17 = 0x1.17739e210171ap-10,
+  .c18 = V2 (-0x1.ab24da7be7402p-13),
+  .c19 = 0x1.358851160a528p-16,
+  .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+  .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+  .minustwo = V2 (0xc000000000000000),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls).  */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+	      uint64x2_t sign_xy, uint64x2_t cmp)
+{
+  /* Account for the sign of x and y.  */
+  ret = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+  return v_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan.  */
+static inline uint64x2_t
+zeroinfnan (uint64x2_t i, const struct data *d)
+{
+  /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1).  */
+  return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
+}
+
+/* Fast implementation of vector atan2.
+   Maximum observed error is 2.8 ulps:
+   _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+	got 0x1.92d628ab678ccp-1
+       want 0x1.92d628ab678cfp-1.  */
+float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  uint64x2_t ix = vreinterpretq_u64_f64 (x);
+  uint64x2_t iy = vreinterpretq_u64_f64 (y);
+
+  uint64x2_t special_cases
+      = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
+
+  uint64x2_t sign_x = vandq_u64 (ix, SignMask);
+  uint64x2_t sign_y = vandq_u64 (iy, SignMask);
+  uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
+
+  float64x2_t ax = vabsq_f64 (x);
+  float64x2_t ay = vabsq_f64 (y);
+
+  uint64x2_t pred_xlt0 = vcltzq_f64 (x);
+  uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
+
+  /* Set up z for call to atan.  */
+  float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+  float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+  float64x2_t z = vdivq_f64 (n, q);
+
+  /* Work out the correct shift.  */
+  float64x2_t shift
+      = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
+  shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
+  shift = vmulq_f64 (shift, d->pi_over_2);
+
+  /* Calculate the polynomial approximation.
+     Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+     full scheme to avoid underflow in x^16.
+     The order 19 polynomial P approximates
+     (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+  float64x2_t z2 = vmulq_f64 (z, z);
+  float64x2_t x2 = vmulq_f64 (z2, z2);
+  float64x2_t x4 = vmulq_f64 (x2, x2);
+  float64x2_t x8 = vmulq_f64 (x4, x4);
+
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+  float64x2_t c1315 = vld1q_f64 (&d->c13);
+  float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+  /* estrin_7.  */
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+  float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+  /* estrin_11.  */
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+  float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+  float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+  float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+  float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+  float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+  float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+  float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+  float64x2_t ret = vfmaq_f64 (p07, p819, x8);
+
+  /* Finalize. y = shift + z + z^3 * P(z^2).  */
+  ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
+  ret = vaddq_f64 (ret, shift);
+
+  if (unlikely (v_any_u64 (special_cases)))
+    return special_case (y, x, ret, sign_xy, special_cases);
+
+  /* Account for the sign of x and y.  */
+  ret = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+
+  return ret;
+}
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+TEST_SIG (V, D, 2, atan2)
+// TODO tighten this once __v_atan2 is fixed
+TEST_ULP (V_NAME_D2 (atan2), 2.9)
+TEST_DISABLE_FENV (V_NAME_D2 (atan2))
+TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c
@ -1,59 +1,64 @@
 /*
 * Single-precision vector atan2(x) function.
 *
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
-  float32x4_t poly[8];
-  float32x4_t pi_over_2;
+  float32x4_t c0, pi_over_2, c4, c6, c2;
+  float c1, c3, c5, c7;
+  uint32x4_t comp_const;
 } data = {
  /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
     [2**-128, 1.0].
     Generated using fpminimax between FLT_MIN and 1.  */
-  .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
-	    V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
-	    V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
-  .pi_over_2 = V4 (0x1.921fb6p+0f),
+  .c0 = V4 (-0x1.55555p-2f),	    .c1 = 0x1.99935ep-3f,
+  .c2 = V4 (-0x1.24051ep-3f),	    .c3 = 0x1.bd7368p-4f,
+  .c4 = V4 (-0x1.491f0ep-4f),	    .c5 = 0x1.93a2c0p-5f,
+  .c6 = V4 (-0x1.4c3c60p-6f),	    .c7 = 0x1.01fd88p-8f,
+  .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
 };

 #define SignMask v_u32 (0x80000000)

 /* Special cases i.e. 0, infinity and nan (fall back to scalar calls).  */
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+	      uint32x4_t sign_xy, uint32x4_t cmp)
 {
+  /* Account for the sign of x and y.  */
+  ret = vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
  return v_call2_f32 (atan2f, y, x, ret, cmp);
 }

 /* Returns 1 if input is the bit representation of 0, infinity or nan.  */
 static inline uint32x4_t
-zeroinfnan (uint32x4_t i)
+zeroinfnan (uint32x4_t i, const struct data *d)
 {
  /* 2 * i - 1 >= 2 * 0x7f800000lu - 1.  */
-  return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
-		    v_u32 (2 * 0x7f800000lu - 1));
+  return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
 }

 /* Fast implementation of vector atan2f. Maximum observed error is
   2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
   _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
 						 want 0x1.967f00p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
 {
-  const struct data *data_ptr = ptr_barrier (&data);
+  const struct data *d = ptr_barrier (&data);

  uint32x4_t ix = vreinterpretq_u32_f32 (x);
  uint32x4_t iy = vreinterpretq_u32_f32 (y);

-  uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
+  uint32x4_t special_cases
+      = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));

  uint32x4_t sign_x = vandq_u32 (ix, SignMask);
  uint32x4_t sign_y = vandq_u32 (iy, SignMask);
@ -67,14 +72,14 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)

  /* Set up z for call to atanf.  */
  float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
-  float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
-  float32x4_t z = vdivq_f32 (n, d);
+  float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+  float32x4_t z = vdivq_f32 (n, q);

  /* Work out the correct shift.  */
  float32x4_t shift = vreinterpretq_f32_u32 (
      vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
  shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
-  shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+  shift = vmulq_f32 (shift, d->pi_over_2);

  /* Calculate the polynomial approximation.
     Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
@ -86,30 +91,37 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
  float32x4_t z2 = vmulq_f32 (z, z);
  float32x4_t z4 = vmulq_f32 (z2, z2);

-  float32x4_t ret = vfmaq_f32 (
-      v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
-      vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+  float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+  float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+
+  float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));

  /* y = shift + z * P(z^2).  */
  ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);

-  /* Account for the sign of y.  */
-  ret = vreinterpretq_f32_u32 (
-      veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
-
  if (unlikely (v_any_u32 (special_cases)))
    {
-      return special_case (y, x, ret, special_cases);
+      return special_case (y, x, ret, sign_xy, special_cases);
    }

-  return ret;
+  /* Account for the sign of x and y.  */
+  return vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
 }

+HALF_WIDTH_ALIAS_F2 (atan2)
+
 /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
-PL_SIG (V, F, 2, atan2)
-PL_TEST_ULP (V_NAME_F2 (atan2), 2.46)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
+TEST_SIG (V, F, 2, atan2)
+TEST_DISABLE_FENV (V_NAME_F2 (atan2))
+TEST_ULP (V_NAME_F2 (atan2), 2.46)
+TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c
@ -1,14 +1,14 @@
 /*
 * Single-precision vector atan(x) function.
 *
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"

 static const struct data
 {
@ -43,7 +43,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
   atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
   using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
   _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -98,10 +98,12 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
  return y;
 }

-PL_SIG (V, F, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (atan), 2.5)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
+HALF_WIDTH_ALIAS_F1 (atan)
+
+TEST_SIG (V, F, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (atan), 2.5)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c
@ -1,13 +1,13 @@
 /*
 * Double-precision vector atanh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #define WANT_V_LOG1P_K0_SHORTCUT 0
 #include "v_log1p_inline.h"
@ -15,15 +15,19 @@
 const static struct data
 {
  struct v_log1p_data log1p_consts;
-  uint64x2_t one, half;
+  uint64x2_t one;
+  uint64x2_t sign_mask;
 } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
 	   .one = V2 (0x3ff0000000000000),
-	   .half = V2 (0x3fe0000000000000) };
+	   .sign_mask = V2 (0x8000000000000000) };

 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+	      uint64x2_t special, const struct data *d)
 {
-  return v_call_f64 (atanh, x, y, special);
+  y = log1p_inline (y, &d->log1p_consts);
+  return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
+		     vmulq_f64 (halfsign, y), special);
 }

 /* Approximation for vector double-precision atanh(x) using modified log1p.
@ -35,11 +39,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
 {
  const struct data *d = ptr_barrier (&data);

+  float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
  float64x2_t ax = vabsq_f64 (x);
  uint64x2_t ia = vreinterpretq_u64_f64 (ax);
-  uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
  uint64x2_t special = vcgeq_u64 (ia, d->one);
-  float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));

 #if WANT_SIMD_EXCEPT
  ax = v_zerofy_f64 (ax, special);
@ -47,20 +50,26 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)

  float64x2_t y;
  y = vaddq_f64 (ax, ax);
-  y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
-  y = log1p_inline (y, &d->log1p_consts);
+  y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));

  if (unlikely (v_any_u64 (special)))
-    return special_case (x, vmulq_f64 (y, halfsign), special);
+#if WANT_SIMD_EXCEPT
+    return special_case (x, halfsign, y, special, d);
+#else
+    return special_case (ax, halfsign, y, special, d);
+#endif
+
+  y = log1p_inline (y, &d->log1p_consts);
  return vmulq_f64 (y, halfsign);
 }

-PL_SIG (V, D, 1, atanh, -1.0, 1.0)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
-PL_TEST_ULP (V_NAME_D1 (atanh), 3.32)
+TEST_SIG (V, D, 1, atanh, -1.0, 1.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_D1 (atanh), 3.32)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100)
 /* atanh is asymptotic at 1, which is the default control value - have to set
   -c 0 specially to ensure fp exceptions are triggered correctly (choice of
   control lane is irrelevant if fp exceptions are disabled).  */
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0)
+TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c
@ -1,13 +1,13 @@
 /*
 * Single-precision vector atanh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
 #include "v_log1pf_inline.h"

 const static struct data
@ -30,16 +30,18 @@ const static struct data
 #define Half v_u32 (0x3f000000)

 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+	      uint32x4_t special)
 {
-  return v_call_f32 (atanhf, x, y, special);
+  return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+		     vmulq_f32 (halfsign, y), special);
 }

 /* Approximation for vector single-precision atanh(x) using modified log1p.
-   The maximum error is 3.08 ULP:
-   __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
-			   want 0x1.ffcb82p-5.  */
-VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
+   The maximum error is 2.93 ULP:
+   _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+				want 0x1.f4dcf8p-5.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -58,20 +60,31 @@ VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
  uint32x4_t special = vcgeq_u32 (iax, d->one);
 #endif

-  float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
-  y = log1pf_inline (y, d->log1pf_consts);
+  float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+			     vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+  y = log1pf_inline (y, &d->log1pf_consts);

+  /* If exceptions not required, pass ax to special-case for shorter dependency
+     chain. If exceptions are required ax will have been zerofied, so have to
+     pass x.  */
  if (unlikely (v_any_u32 (special)))
-    return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+    return special_case (x, halfsign, y, special);
+#else
+    return special_case (ax, halfsign, y, special);
+#endif
  return vmulq_f32 (halfsign, y);
 }

-PL_SIG (V, F, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_F1 (atanh), 2.59)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+HALF_WIDTH_ALIAS_F1 (atanh)
+
+TEST_SIG (V, F, 1, atanh, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (atanh), 2.44)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000)
 /* atanh is asymptotic at 1, which is the default control value - have to set
 -c 0 specially to ensure fp exceptions are triggered correctly (choice of
 control lane is irrelevant if fp exceptions are disabled).  */
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0)
+TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c
@ -1,14 +1,14 @@
 /*
 * Double-precision vector cbrt(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f64.h"

 const static struct data
 {
@ -40,13 +40,20 @@ special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
  return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
 }

-/* Approximation for double-precision vector cbrt(x), using low-order polynomial
-   and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+/* Approximation for double-precision vector cbrt(x), using low-order
+   polynomial and two Newton iterations.
+
+   The vector version of frexp does not handle subnormals
+   correctly. As a result these need to be handled by the scalar
+   fallback, where accuracy may be worse than that of the vector code
+   path.
+
+   Greatest observed error in the normal range is 1.79 ULP. Errors repeat
   according to the exponent, for instance an error observed for double value
   m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
   integer.
-   __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
-				 want 0x1.965fe72821e99p+0.  */
+   _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+				       want 0x1.965fe72821e99p+0.  */
 VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
 {
  const struct data *d = ptr_barrier (&data);
@ -64,8 +71,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
  uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
  int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);

-  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
-     Newton iterations.  */
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+     for Newton iterations.  */
  float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
  float64x2_t one_third = d->one_third;
  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
@ -84,8 +91,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)

     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.

-     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
-     an integer in [-2, 2], and can be looked up in the table T. Hence the
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+     is an integer in [-2, 2], and can be looked up in the table T. Hence the
     result is assembled as:

     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
@ -110,7 +117,11 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
  return vbslq_f64 (d->abs_mask, y, x);
 }

-PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30)
-PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt))
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
+/* Worst-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which
+   has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
+   in the vector path is 1.79 ULP.
+   [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+   Functions in Single, Double, Double Extended, and Quadruple Precision.  */
+TEST_ULP (V_NAME_D1 (cbrt), 3.17)
+TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
+TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c
@ -1,14 +1,14 @@
 /*
 * Single-precision vector cbrt(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"

 const static struct data
 {
@ -49,7 +49,7 @@ shifted_lookup (const float *table, int32x4_t i)
   0x1.85a2aa and the exponent is a multiple of 3, for example:
   _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
 				want 0x1.267932p+1.  */
-VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
  uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
@ -110,7 +110,8 @@ VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
  return vbslq_f32 (SignMask, x, y);
 }

-PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt))
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
+HALF_WIDTH_ALIAS_F1 (cbrt)
+
+TEST_SIG (V, F, 1, cbrt, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cbrt), 1.15)
+TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c
@ -1,13 +1,13 @@
 /*
 * Double-precision vector sincos function - return-by-value interface.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_sincos_common.h"
 #include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"

 static float64x2x2_t VPCS_ATTR NOINLINE
 special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
@ -34,11 +34,13 @@ _ZGVnN2v_cexpi (float64x2_t x)
  return sc;
 }

-PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
-PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin)
+TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
+TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
 #define V_CEXPI_INTERVAL(lo, hi, n)                                           \
-  PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n)                            \
-  PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
+  TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n)                               \
+  TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
 V_CEXPI_INTERVAL (0, 0x1p23, 500000)
 V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
 V_CEXPI_INTERVAL (0x1p23, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c
@ -1,13 +1,13 @@
 /*
 * Single-precision vector cexpi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_sincosf_common.h"
 #include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"

 static float32x4x2_t VPCS_ATTR NOINLINE
 special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
@ -36,11 +36,13 @@ _ZGVnN4v_cexpif (float32x4_t x)
  return sc;
 }

-PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
-PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos)
+TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
+TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
 #define V_CEXPIF_INTERVAL(lo, hi, n)                                          \
-  PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n)                           \
-  PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
+  TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n)                              \
+  TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
 V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
 V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
 V_CEXPIF_INTERVAL (0x1p20, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c
@ -1,17 +1,19 @@
 /*
 * Double-precision vector cos function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"

 static const struct data
 {
  float64x2_t poly[7];
-  float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+  float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
  /* Worst-case error is 3.3 ulp in [-pi/2, pi/2].  */
  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
@ -19,11 +21,9 @@ static const struct data
 	    V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
 	    V2 (-0x1.9e9540300a1p-41) },
  .inv_pi = V2 (0x1.45f306dc9c883p-2),
-  .half_pi = V2 (0x1.921fb54442d18p+0),
  .pi_1 = V2 (0x1.921fb54442d18p+1),
  .pi_2 = V2 (0x1.1a62633145c06p-53),
  .pi_3 = V2 (0x1.c1cd129024e09p-106),
-  .shift = V2 (0x1.8p52),
  .range_val = V2 (0x1p23)
 };

@ -57,10 +57,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
 #endif

  /* n = rint((|x|+pi/2)/pi) - 0.5.  */
-  n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
-  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
-  n = vsubq_f64 (n, d->shift);
-  n = vsubq_f64 (n, v_f64 (0.5));
+  n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+  odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+  n = vsubq_f64 (n, v_f64 (0.5f));

  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
  r = vfmsq_f64 (r, d->pi_1, n);
@ -85,3 +84,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
    return special_case (x, y, odd, cmp);
  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
 }
+
+TEST_SIG (V, D, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (cos), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c
@ -1,17 +1,19 @@
 /*
 * Single-precision vector cos function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"

 static const struct data
 {
  float32x4_t poly[4];
-  float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+  float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
  /* 1.886 ulp error.  */
  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@ -22,8 +24,6 @@ static const struct data
  .pi_3 = V4 (-0x1.ee59dap-49f),

  .inv_pi = V4 (0x1.45f306p-2f),
-  .shift = V4 (0x1.8p+23f),
-  .half_pi = V4 (0x1.921fb6p0f),
  .range_val = V4 (0x1p20f)
 };

@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
  return v_call_f32 (cosf, x, y, cmp);
 }

-float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
  float32x4_t n, r, r2, r3, y;
@ -58,9 +58,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
 #endif

  /* n = rint((|x|+pi/2)/pi) - 0.5.  */
-  n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
-  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
-  n = vsubq_f32 (n, d->shift);
+  n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
  n = vsubq_f32 (n, v_f32 (0.5f));

  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
@ -80,3 +79,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
    return special_case (x, y, odd, cmp);
  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }
+
+HALF_WIDTH_ALIAS_F1 (cos)
+
+TEST_SIG (V, F, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (cos), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c
@ -1,18 +1,20 @@
 /*
 * Double-precision vector cosh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  float64x2_t poly[3];
-  float64x2_t inv_ln2, ln2, shift, thres;
+  float64x2_t inv_ln2;
+  double ln2[2];
+  float64x2_t shift, thres;
  uint64x2_t index_mask, special_bound;
 } data = {
  .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@ -48,8 +50,9 @@ exp_inline (float64x2_t x)
  float64x2_t n = vsubq_f64 (z, d->shift);

  /* r = x - n*ln2/N.  */
-  float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
-  r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+  float64x2_t ln2 = vld1q_f64 (d->ln2);
+  float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+  r = vfmaq_laneq_f64 (r, n, ln2, 1);

  uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
  uint64x2_t i = vandq_u64 (u, d->index_mask);
@ -97,8 +100,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
  return vaddq_f64 (half_t, half_over_t);
 }

-PL_SIG (V, D, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (cosh), 1.43)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh))
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
+TEST_SIG (V, D, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (cosh), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c
@ -1,32 +1,39 @@
 /*
 * Single-precision vector cosh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_expf_inline.h"
 #include "v_math.h"
-#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  struct v_expf_data expf_consts;
-  uint32x4_t tiny_bound, special_bound;
+  uint32x4_t tiny_bound;
+  float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+  uint32x4_t special_bound;
+#endif
 } data = {
  .expf_consts = V_EXPF_DATA,
  .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this.  */
  /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case.  */
+  .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
  .special_bound = V4 (0x42ad496c),
+#endif
 };

 #if !WANT_SIMD_EXCEPT
 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+	      uint32x4_t special)
 {
-  return v_call_f32 (coshf, x, y, special);
+  return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
 }
 #endif

@ -34,18 +41,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
   Maximum error is 2.38 ULP:
   _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
 				 want 0x1.6a4922p+4.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

-  float32x4_t ax = vabsq_f32 (x);
-  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-  uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
-
 #if WANT_SIMD_EXCEPT
  /* If fp exceptions are to be triggered correctly, fall back to the scalar
     variant for all inputs if any input is a special value or above the bound
     at which expf overflows.  */
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
  if (unlikely (v_any_u32 (special)))
    return v_call_f32 (coshf, x, x, v_u32 (-1));

@ -54,10 +60,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
     input to 0, which will generate no exceptions.  */
  if (unlikely (v_any_u32 (tiny)))
    ax = v_zerofy_f32 (ax, tiny);
+  float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+  uint32x4_t special = vcageq_f32 (x, d->bound);
+  float32x4_t t = v_expf_inline (x, &d->expf_consts);
 #endif

  /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.  */
-  float32x4_t t = v_expf_inline (ax, &d->expf_consts);
  float32x4_t half_t = vmulq_n_f32 (t, 0.5);
  float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);

@ -66,15 +75,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
    return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
 #else
  if (unlikely (v_any_u32 (special)))
-    return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+    return special_case (x, half_t, half_over_t, special);
 #endif

  return vaddq_f32 (half_t, half_over_t);
 }

-PL_SIG (V, F, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (cosh), 1.89)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
+HALF_WIDTH_ALIAS_F1 (cosh)
+
+TEST_SIG (V, F, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cosh), 1.89)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c
@ -1,15 +1,15 @@
 /*
 * Double-precision vector cospi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -31,7 +31,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
 {
  /* Fall back to scalar code.  */
  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
-  return v_call_f64 (cospi, x, y, cmp);
+  return v_call_f64 (arm_math_cospi, x, y, cmp);
 }

 /* Approximation for vector double-precision cospi(x).
@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
 }

-PL_SIG (V, D, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_D1 (cospi), 2.56)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (cospi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c
@ -1,15 +1,15 @@
 /*
 * Single-precision vector cospi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -26,14 +26,14 @@ static float32x4_t VPCS_ATTR NOINLINE
 special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
 {
  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
-  return v_call_f32 (cospif, x, y, cmp);
+  return v_call_f32 (arm_math_cospif, x, y, cmp);
 }

 /* Approximation for vector single-precision cospi(x)
    Maximum Error: 3.17 ULP:
    _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
 				  want 0x1.f7cd5p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -74,10 +74,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }

-PL_SIG (V, F, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_F1 (cospi), 2.67)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
+HALF_WIDTH_ALIAS_F1 (cospi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (cospi), 2.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c
@ -1,30 +1,32 @@
 /*
 * Double-precision vector erf(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  float64x2_t third;
-  float64x2_t tenth, two_over_five, two_over_fifteen;
-  float64x2_t two_over_nine, two_over_fortyfive;
+  float64x2_t tenth, two_over_five, two_over_nine;
+  double two_over_fifteen, two_over_fortyfive;
  float64x2_t max, shift;
+  uint64x2_t max_idx;
 #if WANT_SIMD_EXCEPT
  float64x2_t tiny_bound, huge_bound, scale_minus_one;
 #endif
 } data = {
+  .max_idx = V2 (768),
  .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too.  */
-  .two_over_fifteen = V2 (0x1.1111111111111p-3),
+  .two_over_fifteen = 0x1.1111111111111p-3,
  .tenth = V2 (-0x1.999999999999ap-4),
  .two_over_five = V2 (-0x1.999999999999ap-2),
  .two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
-  .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+  .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
  .max = V2 (5.9921875), /* 6 - 1/128.  */
  .shift = V2 (0x1p45),
 #if WANT_SIMD_EXCEPT
@ -46,8 +48,8 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
-	      e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+  float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+	      e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
  e.erf = vuzp1q_f64 (e1, e2);
  e.scale = vuzp2q_f64 (e1, e2);
  return e;
@ -77,8 +79,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
  float64x2_t a = vabsq_f64 (x);
  /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
     to return expected results.  */
-  uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
-  uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+  uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+  uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);

 #if WANT_SIMD_EXCEPT
  /* |x| huge or tiny.  */
@ -105,7 +107,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
     segfault.  */
  uint64x2_t i
      = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
-  i = vbslq_u64 (a_le_max, i, v_u64 (768));
+  i = vbslq_u64 (a_le_max, i, dat->max_idx);
  struct entry e = lookup (i);

  float64x2_t r = vsubq_f64 (z, shift);
@ -115,14 +117,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
  float64x2_t d2 = vmulq_f64 (d, d);
  float64x2_t r2 = vmulq_f64 (r, r);

+  float64x2_t two_over_fifteen_and_fortyfive
+      = vld1q_f64 (&dat->two_over_fifteen);
+
  /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5.  */
  float64x2_t p1 = r;
  float64x2_t p2
      = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
  float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
-  float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+  float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+				    two_over_fifteen_and_fortyfive, 0);
  p4 = vfmsq_f64 (dat->tenth, r2, p4);
-  float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+  float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+				    two_over_fifteen_and_fortyfive, 1);
  p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));

  float64x2_t p34 = vfmaq_f64 (p3, d, p4);
@ -150,9 +157,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
  return y;
 }

-PL_SIG (V, D, 1, erf, -6.0, 6.0)
-PL_TEST_ULP (V_NAME_D1 (erf), 1.79)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
+TEST_SIG (V, D, 1, erf, -6.0, 6.0)
+TEST_ULP (V_NAME_D1 (erf), 1.79)
+/* WANT_SIMD_EXCEPT blocks miss some cases.  */
+TEST_DISABLE_FENV (V_NAME_D1 (erf))
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c
@ -1,21 +1,21 @@
 /*
 * Double-precision vector erfc(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  uint64x2_t offset, table_scale;
  float64x2_t max, shift;
-  float64x2_t p20, p40, p41, p42;
-  float64x2_t p51, p52;
-  float64x2_t qr5, qr6, qr7, qr8, qr9;
+  float64x2_t p20, p40, p41, p51;
+  double p42, p52;
+  double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
 #if WANT_SIMD_EXCEPT
  float64x2_t uflow_bound;
 #endif
@ -30,9 +30,9 @@ static const struct data
  .p20 = V2 (0x1.5555555555555p-2),  /* 1/3, used to compute 2/3 and 1/6.  */
  .p40 = V2 (-0x1.999999999999ap-4), /* 1/10.  */
  .p41 = V2 (-0x1.999999999999ap-2), /* 2/5.  */
-  .p42 = V2 (0x1.1111111111111p-3),  /* 2/15.  */
+  .p42 = 0x1.1111111111111p-3,	     /* 2/15.  */
  .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9.  */
-  .p52 = V2 (0x1.6c16c16c16c17p-5),  /* 2/45.  */
+  .p52 = 0x1.6c16c16c16c17p-5,	     /* 2/45.  */
  /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9.  */
  .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
  .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@ -57,8 +57,10 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
  struct entry e;
-  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
-	      e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+  float64x2_t e1
+      = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+  float64x2_t e2
+      = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
  e.erfc = vuzp1q_f64 (e1, e2);
  e.scale = vuzp2q_f64 (e1, e2);
  return e;
@ -144,22 +146,26 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
  float64x2_t p1 = r;
  float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
  float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
-  float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+  float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+  float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
  p4 = vfmsq_f64 (dat->p40, r2, p4);
-  float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+  float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
  p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
  /* Compute p_i using recurrence relation:
     p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}.  */
-  float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
-  p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
-  float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
-  p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
-  float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
-  p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
-  float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
-  p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
-  float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
-  p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+  float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+	      qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+	      qr9 = vld1q_f64 (dat->qr9);
+  float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+  p6 = vmulq_laneq_f64 (p6, qr5, 1);
+  float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+  p7 = vmulq_laneq_f64 (p7, qr6, 1);
+  float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+  p8 = vmulq_laneq_f64 (p8, qr7, 1);
+  float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+  p9 = vmulq_laneq_f64 (p9, qr8, 1);
+  float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+  p10 = vmulq_laneq_f64 (p10, qr9, 1);
  /* Compute polynomial in d using pairwise Horner scheme.  */
  float64x2_t p90 = vfmaq_f64 (p9, d, p10);
  float64x2_t p78 = vfmaq_f64 (p7, d, p8);
@ -189,10 +195,11 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
  return vfmaq_f64 (off, fac, y);
 }

-PL_SIG (V, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (V_NAME_D1 (erfc), 1.21)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
+TEST_SIG (V, D, 1, erfc, -6.0, 28.0)
+TEST_ULP (V_NAME_D1 (erfc), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c
@ -1,19 +1,20 @@
 /*
 * Single-precision vector erfc(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  uint32x4_t offset, table_scale;
  float32x4_t max, shift;
-  float32x4_t coeffs, third, two_over_five, tenth;
+  float coeffs[4];
+  float32x4_t third, two_over_five, tenth;
 #if WANT_SIMD_EXCEPT
  float32x4_t uflow_bound;
 #endif
@ -27,7 +28,7 @@ static const struct data
  .shift = V4 (0x1p17f),
  /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
     fmas.  */
-  .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+  .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
  .third = V4 (0x1.555556p-2f),
  .two_over_five = V4 (-0x1.99999ap-2f),
  .tenth = V4 (-0x1.99999ap-4f),
@ -50,12 +51,16 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
  struct entry e;
-  float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
-  float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
-  float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
-  float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
-  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
-  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+  float32x2_t t0
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+  float32x2_t t1
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+  float32x2_t t2
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+  float32x2_t t3
+      = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+  float32x4_t e1 = vcombine_f32 (t0, t1);
+  float32x4_t e2 = vcombine_f32 (t2, t3);
  e.erfc = vuzp1q_f32 (e1, e2);
  e.scale = vuzp2q_f32 (e1, e2);
  return e;
@ -86,8 +91,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
   Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
   _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
 				want 0x1.f51216p-120.  */
-VPCS_ATTR
-float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
+NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
 {
  const struct data *dat = ptr_barrier (&data);

@ -130,10 +134,11 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
  float32x4_t r2 = vmulq_f32 (r, r);

  float32x4_t p1 = r;
-  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+  float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
  float32x4_t p3
-      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
-  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
  p4 = vfmsq_f32 (dat->tenth, r2, p4);

  float32x4_t y = vfmaq_f32 (p3, d, p4);
@ -157,10 +162,13 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
  return vfmaq_f32 (off, fac, y);
 }

-PL_SIG (V, F, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (erfc), 1.14)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
+HALF_WIDTH_ALIAS_F1 (erfc)
+
+TEST_SIG (V, F, 1, erfc, -4.0, 10.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_F1 (erfc), 1.14)
+TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c
@ -1,13 +1,13 @@
 /*
 * Single-precision vector erf(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -37,12 +37,12 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
  struct entry e;
-  float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
-  float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
-  float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
-  float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
-  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
-  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+  float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+  float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+  float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+  float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+  float32x4_t e1 = vcombine_f32 (t0, t1);
+  float32x4_t e2 = vcombine_f32 (t2, t3);
  e.erf = vuzp1q_f32 (e1, e2);
  e.scale = vuzp2q_f32 (e1, e2);
  return e;
@ -61,7 +61,7 @@ lookup (uint32x4_t i)
   Maximum error: 1.93 ULP
     _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
 				 want 0x1.fd6868p-9.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
 {
  const struct data *dat = ptr_barrier (&data);

@ -110,9 +110,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
  return y;
 }

-PL_SIG (V, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (V_NAME_F1 (erf), 1.43)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
+HALF_WIDTH_ALIAS_F1 (erf)
+
+TEST_SIG (V, F, 1, erf, -4.0, 4.0)
+TEST_ULP (V_NAME_F1 (erf), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c
@ -1,12 +1,14 @@
 /*
 * Double-precision vector e^x function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"

 #define N (1 << V_EXP_TABLE_BITS)
 #define IndexMask (N - 1)
@ -123,3 +125,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)

  return vfmaq_f64 (s, y, s);
 }
+
+TEST_SIG (V, D, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp), 1.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c
@ -1,14 +1,15 @@
 /*
 * Double-precision vector 10^x function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

+#define _GNU_SOURCE
 #include "mathlib.h"
 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 /* Value of |x| above which scale overflows without special treatment.  */
 #define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1.  */
@ -135,10 +136,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
  return vfmaq_f64 (s, y, s);
 }

-PL_SIG (S, D, 1, exp10, -9.9, 9.9)
-PL_SIG (V, D, 1, exp10, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_D1 (exp10), 1.15)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#if WANT_EXP10_TESTS
+TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp10), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c
@ -1,23 +1,24 @@
 /*
 * Single-precision vector 10^x function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

-#include "mathlib.h"
+#define _GNU_SOURCE
 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"

 #define ScaleBound 192.0f

 static const struct data
 {
-  float32x4_t poly[5];
-  float32x4_t log10_2_and_inv, shift;
-
+  float32x4_t c0, c1, c3;
+  float log10_2_high, log10_2_low, c2, c4;
+  float32x4_t inv_log10_2, special_bound;
+  uint32x4_t exponent_bias, special_offset, special_bias;
 #if !WANT_SIMD_EXCEPT
  float32x4_t scale_thresh;
 #endif
@ -27,19 +28,24 @@ static const struct data
     rel error: 0x1.89dafa3p-24
     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
     maxerr: 1.85943 +0.5 ulp.  */
-  .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
-	    V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
-  .shift = V4 (0x1.8p23f),
-
-  /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0.  */
-  .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+  .c0 = V4 (0x1.26bb16p+1f),
+  .c1 = V4 (0x1.5350d2p+1f),
+  .c2 = 0x1.04744ap+1f,
+  .c3 = V4 (0x1.2d8176p+0f),
+  .c4 = 0x1.12b41ap-1f,
+  .inv_log10_2 = V4 (0x1.a934fp+1),
+  .log10_2_high = 0x1.344136p-2,
+  .log10_2_low = 0x1.ec10cp-27,
+  /* rint (log2 (2^127 / (1 + sqrt (2)))).  */
+  .special_bound = V4 (126.0f),
+  .exponent_bias = V4 (0x3f800000),
+  .special_offset = V4 (0x82000000),
+  .special_bias = V4 (0x7f000000),
 #if !WANT_SIMD_EXCEPT
  .scale_thresh = V4 (ScaleBound)
 #endif
 };

-#define ExponentBias v_u32 (0x3f800000)
-
 #if WANT_SIMD_EXCEPT

 # define SpecialBound 38.0f	       /* rint(log10(2^127)).  */
@ -57,17 +63,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)

 #else

-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))).  */
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
+# define SpecialBound 126.0f

 static float32x4_t VPCS_ATTR NOINLINE
 special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
 	      float32x4_t scale, const struct data *d)
 {
  /* 2^n may overflow, break it up into s1*s2.  */
-  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
-  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
  float32x4_t r2 = vmulq_f32 (s1, s1);
@ -84,7 +88,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
   Algorithm is accurate to 2.36 ULP.
   _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
 				 want 0x1.7e79cp+11.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
 #if WANT_SIMD_EXCEPT
@ -102,22 +106,23 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
  /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
     with poly(r) in [1/sqrt(2), sqrt(2)] and
     x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2].  */
-  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
-  float32x4_t n = vsubq_f32 (z, d->shift);
-  float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
-  r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
-  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+  float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+  float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+  float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+  r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);

-  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

 #if !WANT_SIMD_EXCEPT
-  uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
 #endif

  float32x4_t r2 = vmulq_f32 (r, r);
-  float32x4_t poly
-      = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
-		   v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+  float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+  float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+  float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+  float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);

  if (unlikely (v_any_u32 (cmp)))
 #if WANT_SIMD_EXCEPT
@ -129,10 +134,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
  return vfmaq_f32 (scale, poly, scale);
 }

-PL_SIG (S, F, 1, exp10, -9.9, 9.9)
-PL_SIG (V, F, 1, exp10, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_F1 (exp10), 1.86)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+HALF_WIDTH_ALIAS_F1 (exp10)
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp10), 1.86)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c
@ -1,19 +1,20 @@
 /*
 * Double-precision vector 2^x function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #define N (1 << V_EXP_TABLE_BITS)
 #define IndexMask (N - 1)
 #define BigBound 1022.0
 #define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511).  */

 static const struct data
 {
@ -38,7 +39,6 @@ lookup_sbits (uint64x2_t i)

 #if WANT_SIMD_EXCEPT

-# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511).  */
 # define Thres 0x2080000000000000     /* asuint64(512.0) - TinyBound.  */

 /* Call scalar exp2 as a fallback.  */
@ -62,8 +62,8 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n,
  /* 2^(n/N) may overflow, break it up into s1*s2.  */
  uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
-  float64x2_t s2 = vreinterpretq_f64_u64 (
-    vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
+      vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
  uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
  float64x2_t r1 = vmulq_f64 (s1, s1);
  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
@ -119,10 +119,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
  return vfmaq_f64 (s, s, y);
 }

-PL_SIG (V, D, 1, exp2, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_D1 (exp2), 1.15)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
+TEST_SIG (V, D, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp2), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c
@ -1,33 +1,38 @@
 /*
 * Single-precision vector 2^x function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

-#include "mathlib.h"
 #include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"

 static const struct data
 {
-  float32x4_t poly[5];
-  uint32x4_t exponent_bias;
+  float32x4_t c1, c3;
+  uint32x4_t exponent_bias, special_offset, special_bias;
 #if !WANT_SIMD_EXCEPT
-  float32x4_t special_bound, scale_thresh;
+  float32x4_t scale_thresh, special_bound;
 #endif
+  float c0, c2, c4, zero;
 } data = {
  /* maxerr: 1.962 ulp.  */
-  .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
-	    V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+  .c0 = 0x1.59977ap-10f,
+  .c1 = V4 (0x1.3ce9e4p-7f),
+  .c2 = 0x1.c6bd32p-5f,
+  .c3 = V4 (0x1.ebf9bcp-3f),
+  .c4 = 0x1.62e422p-1f,
  .exponent_bias = V4 (0x3f800000),
+  .special_offset = V4 (0x82000000),
+  .special_bias = V4 (0x7f000000),
 #if !WANT_SIMD_EXCEPT
  .special_bound = V4 (126.0f),
  .scale_thresh = V4 (192.0f),
 #endif
 };

-#define C(i) d->poly[i]
-
 #if WANT_SIMD_EXCEPT

 # define TinyBound v_u32 (0x20000000)	  /* asuint (0x1p-63).  */
@ -44,16 +49,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)

 #else

-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
 static float32x4_t VPCS_ATTR NOINLINE
 special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
 	      float32x4_t scale, const struct data *d)
 {
  /* 2^n may overflow, break it up into s1*s2.  */
-  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
-  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
  float32x4_t r2 = vmulq_f32 (s1, s1);
@ -66,16 +68,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,

 #endif

-float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
-  float32x4_t n, r, r2, scale, p, q, poly;
-  uint32x4_t cmp, e;

 #if WANT_SIMD_EXCEPT
  /* asuint(|x|) - TinyBound >= BigBound - TinyBound.  */
  uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
-  cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+  uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
  float32x4_t xm = x;
  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
     special_case to fix special lanes later. This is only necessary if fenv
@ -84,23 +84,24 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
    x = vbslq_f32 (cmp, v_f32 (1), x);
 #endif

-    /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
-       x = n + r, with r in [-1/2, 1/2].  */
-  n = vrndaq_f32 (x);
-  r = vsubq_f32 (x, n);
-  e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
-  scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+  /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = n + r, with r in [-1/2, 1/2].  */
+  float32x4_t n = vrndaq_f32 (x);
+  float32x4_t r = vsubq_f32 (x, n);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

 #if !WANT_SIMD_EXCEPT
-  cmp = vcagtq_f32 (n, d->special_bound);
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
 #endif

-  r2 = vmulq_f32 (r, r);
-  p = vfmaq_f32 (C (1), C (0), r);
-  q = vfmaq_f32 (C (3), C (2), r);
+  float32x4_t c024 = vld1q_f32 (&d->c0);
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+  float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
  q = vfmaq_f32 (q, p, r2);
-  p = vmulq_f32 (C (4), r);
-  poly = vfmaq_f32 (p, q, r2);
+  p = vmulq_laneq_f32 (r, c024, 2);
+  float32x4_t poly = vfmaq_f32 (p, q, r2);

  if (unlikely (v_any_u32 (cmp)))
 #if WANT_SIMD_EXCEPT
@ -111,3 +112,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)

  return vfmaq_f32 (scale, poly, scale);
 }
+
+HALF_WIDTH_ALIAS_F1 (exp2)
+
+TEST_SIG (V, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp2), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c
@ -0,0 +1,73 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float32x4_t c0, c1, c2, c3, c4, c5, shift;
+  uint32x4_t exponent_bias;
+  float32x4_t special_bound, scale_thresh;
+  uint32x4_t special_offset, special_bias;
+} data = {
+  .shift = V4 (0x1.8p23f), /* Not read by any code in exp2f_1u.c.  */
+  .exponent_bias = V4 (0x3f800000), /* asuint (1.0f).  */
+  .special_bound = V4 (126.0f), /* |n| above this goes to specialcase.  */
+  .scale_thresh = V4 (192.0f), /* |n| above this returns s1 * s1 there.  */
+  .special_offset = V4 (0x82000000),
+  .special_bias = V4 (0x7f000000),
+  /*  maxerr: 0.878 ulp.  Coefficients run from highest degree (c0) down to
+      the coefficient of r (c5); the constant term 1 is added by the
+      routine.  */
+  .c0 = V4 (0x1.416b5ep-13f),
+  .c1 = V4 (0x1.5f082ep-10f),
+  .c2 = V4 (0x1.3b2dep-7f),
+  .c3 = V4 (0x1.c6af7cp-5f),
+  .c4 = V4 (0x1.ebfbdcp-3f),
+  .c5 = V4 (0x1.62e43p-1f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2.  b is special_offset in lanes
+     where n <= 0 and zero elsewhere; it is added to special_bias to form s1
+     and subtracted from e to form s2.  Lanes with |n| > scale_thresh return
+     s1*s1, every other lane returns p*s1*s2.  */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+  float32x4_t r1 = vmulq_f32 (s1, s1);
+  float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+  return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+				| (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
+  /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+     x = n + r, with r in [-1/2, 1/2].
+     n is x rounded to the nearest integer (ties away from zero); e is that
+     integer shifted into the float exponent field, so adding exponent_bias
+     gives scale = 2^n.  poly(r) is evaluated by Horner's scheme from c0
+     down to c5 and then the constant 1.  If any lane has
+     |n| > special_bound, all lanes are computed by specialcase instead.  */
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n = vrndaq_f32 (x);
+  float32x4_t r = x - n;
+  uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+  float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+
+  float32x4_t p = vfmaq_f32 (d->c1, d->c0, r);
+  p = vfmaq_f32 (d->c2, p, r);
+  p = vfmaq_f32 (d->c3, p, r);
+  p = vfmaq_f32 (d->c4, p, r);
+  p = vfmaq_f32 (d->c5, p, r);
+  p = vfmaq_f32 (v_f32 (1.0f), p, r); /* Constant term: p = 1 + p * r.  */
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (p, n, e, d);
+  return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u)
+TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c
@ -1,30 +1,34 @@
 /*
 * Single-precision vector e^x function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
-
-#include "mathlib.h"
 #include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"

 static const struct data
 {
-  float32x4_t poly[5];
-  float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
-  uint32x4_t exponent_bias;
+  float32x4_t c1, c3, c4, inv_ln2;
+  float ln2_hi, ln2_lo, c0, c2;
+  uint32x4_t exponent_bias, special_offset, special_bias;
 #if !WANT_SIMD_EXCEPT
  float32x4_t special_bound, scale_thresh;
 #endif
 } data = {
  /* maxerr: 1.45358 +0.5 ulp.  */
-  .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
-	    V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
-  .shift = V4 (0x1.8p23f),
+  .c0 = 0x1.0e4020p-7f,
+  .c1 = V4 (0x1.573e2ep-5f),
+  .c2 = 0x1.555e66p-3f,
+  .c3 = V4 (0x1.fffdb6p-2f),
+  .c4 = V4 (0x1.ffffecp-1f),
  .inv_ln2 = V4 (0x1.715476p+0f),
-  .ln2_hi = V4 (0x1.62e4p-1f),
-  .ln2_lo = V4 (0x1.7f7d1cp-20f),
+  .ln2_hi = 0x1.62e4p-1f,
+  .ln2_lo = 0x1.7f7d1cp-20f,
  .exponent_bias = V4 (0x3f800000),
+  .special_offset = V4 (0x82000000),
+  .special_bias = V4 (0x7f000000),
 #if !WANT_SIMD_EXCEPT
  .special_bound = V4 (126.0f),
  .scale_thresh = V4 (192.0f),
@ -49,19 +53,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)

 #else

-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
 static float32x4_t VPCS_ATTR NOINLINE
 special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
 	      float32x4_t scale, const struct data *d)
 {
  /* 2^n may overflow, break it up into s1*s2.  */
-  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
-  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
  float32x4_t r2 = vmulq_f32 (s1, s1);
+  // (s2 + p*s2)*s1 = s2(p+1)s1
  float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
  /* Similar to r1 but avoids double rounding in the subnormal range.  */
  float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@ -71,15 +73,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,

 #endif

-float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
-  float32x4_t n, r, r2, scale, p, q, poly, z;
-  uint32x4_t cmp, e;
+  float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);

 #if WANT_SIMD_EXCEPT
  /* asuint(x) - TinyBound >= BigBound - TinyBound.  */
-  cmp = vcgeq_u32 (
+  uint32x4_t cmp = vcgeq_u32 (
      vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
 		 TinyBound),
      SpecialBound);
@ -93,23 +94,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)

  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
-  z = vfmaq_f32 (d->shift, x, d->inv_ln2);
-  n = vsubq_f32 (z, d->shift);
-  r = vfmsq_f32 (x, n, d->ln2_hi);
-  r = vfmsq_f32 (r, n, d->ln2_lo);
-  e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
-  scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+  float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+  float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+  r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

 #if !WANT_SIMD_EXCEPT
-  cmp = vcagtq_f32 (n, d->special_bound);
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
 #endif

-  r2 = vmulq_f32 (r, r);
-  p = vfmaq_f32 (C (1), C (0), r);
-  q = vfmaq_f32 (C (3), C (2), r);
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+  float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
  q = vfmaq_f32 (q, p, r2);
-  p = vmulq_f32 (C (4), r);
-  poly = vfmaq_f32 (p, q, r2);
+  p = vmulq_f32 (d->c4, r);
+  float32x4_t poly = vfmaq_f32 (p, q, r2);

  if (unlikely (v_any_u32 (cmp)))
 #if WANT_SIMD_EXCEPT
@ -120,3 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)

  return vfmaq_f32 (scale, poly, scale);
 }
+
+HALF_WIDTH_ALIAS_F1 (exp)
+
+TEST_SIG (V, F, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c
@ -0,0 +1,79 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float32x4_t shift, inv_ln2;
+  uint32x4_t exponent_bias;
+  float32x4_t c1, c2, c3, c4;
+  float32x4_t special_bound, scale_thresh;
+  uint32x4_t special_offset, special_bias;
+  float ln2_hi, ln2_lo, c0, nothing; /* Loaded as one vector; see below.  */
+} data = {
+  .ln2_hi = 0x1.62e4p-1f,
+  .ln2_lo = 0x1.7f7d1cp-20f,
+  .shift = V4 (0x1.8p23f), /* Not read by any code in expf_1u.c.  */
+  .inv_ln2 = V4 (0x1.715476p+0f),
+  .exponent_bias = V4 (0x3f800000), /* asuint (1.0f).  */
+  .special_bound = V4 (126.0f), /* |n| above this goes to specialcase.  */
+  .scale_thresh = V4 (192.0f), /* |n| above this returns s1 * s1 there.  */
+  .special_offset = V4 (0x83000000),
+  .special_bias = V4 (0x7f000000),
+  /*  maxerr: 0.36565 +0.5 ulp.  c0 is the highest-degree coefficient and c4
+      multiplies r^2; the r and constant terms are both 1 and are added by
+      the routine.  ln2_hi, ln2_lo and c0 are scalars read with a single
+      vld1q_f32 (&d->ln2_hi) and selected by lane; nothing only fills the
+      fourth lane, which is never selected.  */
+  .c0 = 0x1.6a6000p-10f,
+  .c1 = V4 (0x1.12718ep-7f),
+  .c2 = V4 (0x1.555af0p-5f),
+  .c3 = V4 (0x1.555430p-3f),
+  .c4 = V4 (0x1.fffff4p-2f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2.  b is special_offset in lanes
+     where n <= 0 and zero elsewhere; it is added to special_bias to form s1
+     and subtracted from e to form s2.  Lanes with |n| > scale_thresh return
+     s1*s1, every other lane returns p*s1*s2.  */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+  float32x4_t r1 = vmulq_f32 (s1, s1);
+  float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+  return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+				| (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi); /* ln2_hi, ln2_lo, c0, pad.  */
+
+  /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].
+     ln2 is applied as ln2_hi + ln2_lo in two fused multiply-subtracts.
+     poly(r) = 1 + r + c4*r^2 + ... + c0*r^6, so the last two Horner steps
+     both add 1.0 - the repeated statement below is intentional.  If any
+     lane has |n| > special_bound, all lanes are computed by specialcase.  */
+  float32x4_t z = vmulq_f32 (x, d->inv_ln2);
+  float32x4_t n = vrndaq_f32 (z);
+  float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
+  r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+  float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
+  p = vfmaq_f32 (d->c2, p, r);
+  p = vfmaq_f32 (d->c3, p, r);
+  p = vfmaq_f32 (d->c4, p, r);
+  p = vfmaq_f32 (v_f32 (1.0f), p, r); /* Supplies the coefficient of r.  */
+  p = vfmaq_f32 (v_f32 (1.0f), p, r); /* Supplies the constant term.  */
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (p, n, e, d);
+  return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
+TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c
@ -0,0 +1,77 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+  struct v_expm1_data d;
+#if WANT_SIMD_EXCEPT
+  uint64x2_t thresh, tiny_bound;
+#else
+  float64x2_t oflow_bound;
+#endif
+} data = {
+  .d = V_EXPM1_DATA,
+#if WANT_SIMD_EXCEPT
+  /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+     compare.  */
+  .thresh = V2 (0x78c56fa6d34b552),
+  /* asuint64(0x1p-51) << 1.  */
+  .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+  /* Value above which expm1(x) should overflow. Absolute value of the
+     underflow bound is greater than this, so it catches both cases - there is
+     a small window where fallbacks are triggered unnecessarily.  */
+  .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
+{
+  return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+		     special);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+   The maximum observed error is 2.05 ULP:
+  _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+				      want 0x1.a8897eef87b32p-2.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  uint64x2_t ix = vreinterpretq_u64_f64 (x);
+  /* If fp exceptions are to be triggered correctly, fall back to scalar for
+     |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+     shift-left by 1, and compare with thresh which was left-shifted offline -
+     this is effectively an absolute compare.  */
+  uint64x2_t special
+      = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
+#else
+  /* Large input and Infs.  This is a floating-point compare, so NaN lanes
+     compare false, are not flagged, and are evaluated by expm1_inline.  */
+  uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
+#endif
+
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x, special, d);
+
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return expm1_inline (x, &d->d);
+}
+
+TEST_SIG (V, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (expm1), 1.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c
@ -0,0 +1,82 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+  struct v_expm1f_data d;
+#if WANT_SIMD_EXCEPT
+  uint32x4_t thresh;
+#else
+  float32x4_t oflow_bound;
+#endif
+} data = {
+  .d = V_EXPM1F_DATA,
+#if !WANT_SIMD_EXCEPT
+  /* Value above which expm1f(x) should overflow. Absolute value of the
+     underflow bound is greater than this, so it catches both cases - there is
+     a small window where fallbacks are triggered unnecessarily.  */
+  .oflow_bound = V4 (0x1.5ebc4p+6),
+#else
+  /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
+     compare.  */
+  .thresh = V4 (0x1d5ebc40),
+#endif
+};
+
+/* asuint(0x1p-23), shifted by 1 for abs compare.  */
+#define TinyBound v_u32 (0x34000000 << 1)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
+{
+  return v_call_f32 (
+      expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
+}
+
+/* Single-precision vector exp(x) - 1 function.
+   The maximum error is 1.62 ULP:
+   _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+				want 0x1.da9f44p-2.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  /* If fp exceptions are to be triggered correctly, fall back to scalar for
+     |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+     shift-left by 1, and compare with thresh which was left-shifted offline -
+     this is effectively an absolute compare.  */
+  uint32x4_t special
+      = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
+#else
+  /* Flags very large values (+ve and -ve) and +/-Inf.  This is a
+     floating-point compare, so NaN lanes compare false, are not flagged, and
+     are evaluated by expm1f_inline.  */
+  uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
+#endif
+
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, special, d);
+
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return expm1f_inline (x, &d->d);
+}
+
+HALF_WIDTH_ALIAS_F1 (expm1)
+
+TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (expm1), 1.13)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
@ -1,7 +1,7 @@
 /*
 * Double-precision x^y function.
 *
- * Copyright (c) 2018-2023, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

@ -108,7 +108,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
      sbits -= 1009ull << 52;
      scale = asdouble (sbits);
      y = 0x1p1009 * (scale + scale * tmp);
-      return check_oflow (eval_as_double (y));
+      return y;
    }
  /* k < 0, need special care in the subnormal range.  */
  sbits += 1022ull << 52;
@ -128,7 +128,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
      lo = scale - y + scale * tmp;
      hi = one + y;
      lo = one - hi + y + lo;
-      y = eval_as_double (hi + lo) - one;
+      y = (hi + lo) - one;
      /* Fix the sign of 0.  */
      if (y == 0.0)
 	y = asdouble (sbits & 0x8000000000000000);
@ -137,7 +137,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
    }
 #endif
  y = 0x1p-1022 * y;
-  return check_uflow (eval_as_double (y));
+  return y;
 }

 /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
@ -192,7 +192,7 @@ exp_inline (double x, double xtail, uint32_t sign_bias)
  double scale = asdouble (sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
     is no spurious underflow here even without fma.  */
-  return eval_as_double (scale + scale * tmp);
+  return scale + scale * tmp;
 }

 /* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
@ -239,7 +239,7 @@ exp_nosignbias (double x, double xtail)
  double scale = asdouble (sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
     is no spurious underflow here even without fma.  */
-  return eval_as_double (scale + scale * tmp);
+  return scale + scale * tmp;
 }

 /* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
@ -267,7 +267,7 @@ zeroinfnan (uint64_t i)
 }

 static double NOINLINE
-__pl_finite_pow (double x, double y)
+pow_scalar_special_case (double x, double y)
 {
  uint32_t sign_bias = 0;
  uint64_t ix, iy;
@ -311,9 +311,7 @@ __pl_finite_pow (double x, double y)
 	  if (2 * ix == 0 && iy >> 63)
 	    return __math_divzero (sign_bias);
 #endif
-	  /* Without the barrier some versions of clang hoist the 1/x2 and
-	     thus division by zero exception can be signaled spuriously.  */
-	  return iy >> 63 ? opt_barrier_double (1 / x2) : x2;
+	  return iy >> 63 ? 1 / x2 : x2;
 	}
      /* Here x and y are non-zero finite.  */
      if (ix >> 63)
@ -349,9 +347,7 @@ __pl_finite_pow (double x, double y)
      if (topx == 0)
 	{
 	  /* Normalize subnormal x so exponent becomes negative.  */
-	  /* Without the barrier some versions of clang evalutate the mul
-	     unconditionally causing spurious overflow exceptions.  */
-	  ix = asuint64 (opt_barrier_double (x) * 0x1p52);
+	  ix = asuint64 (x * 0x1p52);
 	  ix &= 0x7fffffffffffffff;
 	  ix -= 52ULL << 52;
 	}
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c
@ -1,13 +1,13 @@
 /*
 * Double-precision vector hypot(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #if WANT_SIMD_EXCEPT
 static const struct data
@ -15,7 +15,7 @@ static const struct data
  uint64x2_t tiny_bound, thres;
 } data = {
  .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511).  */
-  .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound.  */
+  .thres = V2 (0x3fe0000000000000),	 /* asuint (0x1p511) - tiny_bound.  */
 };
 #else
 static const struct data
@ -24,7 +24,7 @@ static const struct data
  uint32x4_t thres;
 } data = {
  .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969).  */
-  .thres = V4 (0x7c900000),	 /* asuint (inf) - tiny_bound.  */
+  .thres = V4 (0x7c900000),		 /* asuint (inf) - tiny_bound.  */
 };
 #endif

@ -75,9 +75,9 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)

  float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);

-  uint32x2_t special = vcge_u32 (
-      vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
-      vget_low_u32 (d->thres));
+  uint32x2_t special
+      = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+		  vget_low_u32 (d->thres));

  if (unlikely (v_any_u32h (special)))
    return special_case (x, y, sqsum, special);
@ -86,10 +86,10 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
 }
 #endif

-PL_SIG (V, D, 2, hypot, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D2 (hypot), 1.21)
-PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
+TEST_SIG (V, D, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_D2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c
@ -1,13 +1,13 @@
 /*
 * Single-precision vector hypot(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #if WANT_SIMD_EXCEPT
 static const struct data
@ -15,7 +15,7 @@ static const struct data
  uint32x4_t tiny_bound, thres;
 } data = {
  .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63).  */
-  .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound.  */
+  .thres = V4 (0x3f000000),	 /* asuint (0x1p63) - tiny_bound.  */
 };
 #else
 static const struct data
@ -24,7 +24,7 @@ static const struct data
  uint16x8_t thres;
 } data = {
  .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102).  */
-  .thres = V8 (0x7300), /* asuint (inf) - tiny_bound.  */
+  .thres = V8 (0x7300),		 /* asuint (inf) - tiny_bound.  */
 };
 #endif

@ -41,7 +41,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
 						    want 0x1.6a41dp-13.  */
 #if WANT_SIMD_EXCEPT

-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
 {
  const struct data *d = ptr_barrier (&data);

@ -68,15 +68,15 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
 }
 #else

-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
 {
  const struct data *d = ptr_barrier (&data);

  float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);

-  uint16x4_t special = vcge_u16 (
-      vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
-      vget_low_u16 (d->thres));
+  uint16x4_t special
+      = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+		  vget_low_u16 (d->thres));

  if (unlikely (v_any_u16h (special)))
    return special_case (x, y, sqsum, special);
@ -85,10 +85,12 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
 }
 #endif

-PL_SIG (V, F, 2, hypot, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F2 (hypot), 1.21)
-PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
+HALF_WIDTH_ALIAS_F2 (hypot)
+
+TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_F2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c
@ -0,0 +1,118 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+  uint64x2_t off, sign_exp_mask, offset_lower_bound;
+  uint32x4_t special_bound;
+  float64x2_t c0, c2;
+  double c1, c3, ln2, c4; /* Loaded pairwise as { c1, c3 } and { ln2, c4 }.  */
+} data = {
+  /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ].  */
+  .c0 = V2 (-0x1.ffffffffffff7p-2),
+  .c1 = 0x1.55555555170d4p-2,
+  .c2 = V2 (-0x1.0000000399c27p-2),
+  .c3 = 0x1.999b2e90e94cap-3,
+  .c4 = -0x1.554e550bd501ep-3,
+  .ln2 = 0x1.62e42fefa39efp-1,
+  .sign_exp_mask = V2 (0xfff0000000000000),
+  .off = V2 (0x3fe6900900000000),
+  /* Lower bound is 0x0010000000000000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound - offset (which wraps around).  */
+  .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+  .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022).  */
+};
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+  float64x2_t invc; /* 1/c for each lane, used to form r = z/c - 1.  */
+  float64x2_t logc; /* log(c) for each lane, added into hi.  */
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).
+     Each vld1q_f64 below reads two consecutive doubles starting at a table
+     entry's invc (this relies on the entry's logc being stored right after
+     invc - the table layout is declared elsewhere), and the uzp1/uzp2 pair
+     regroups the two loads into per-lane invc and logc vectors.  */
+  struct entry e;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+	      uint32x2_t special, const struct data *d)
+{
+  float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+  return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log routine.
+   The maximum observed error is 2.17 ULP:
+   _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+				     want 0x1.ffffff1cca045p-2.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint64x2_t u = vreinterpretq_u64_f64 (x);
+  uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+  uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+  float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+  struct entry e = lookup (u_off);
+
+  uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+				 vget_low_u32 (d->special_bound));
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  float64x2_t kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
+  float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
+
+  /* Result is hi + r2*(c0 + r*c1 + r2*(c2 + r*c3 + r2*c4)).  y ends up
+     holding the bracketed polynomial; the final multiply by r2 and the
+     addition of hi are done by the fma at the return (or in special_case).  */
+  float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+  float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+  y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
+  y = vfmaq_f64 (p, r2, y);
+
+  if (unlikely (v_any_u32h (special)))
+    return special_case (hi, u_off, y, r2, special, d);
+  return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log), 1.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000)
+TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c
@ -0,0 +1,132 @@
+/*
+ * Double-precision vector log10(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Constants for the double-precision vector log10 routine.  The scalar
+   members are declared as adjacent pairs (invln10/log10_2 and c1/c3) because
+   the routine fetches each pair with one vld1q_f64 and picks the value it
+   needs by lane, so their order within the struct must be preserved.  */
+static const struct data
+{
+  uint64x2_t off, sign_exp_mask, offset_lower_bound;
+  uint32x4_t special_bound;
+  double invln10, log10_2;
+  double c1, c3;
+  float64x2_t c0, c2, c4;
+} data = {
+  /* Computed from log coefficients divided by log(10) then rounded to double
+     precision.  */
+  .c0 = V2 (-0x1.bcb7b1526e506p-3),
+  .c1 = 0x1.287a7636be1d1p-3,
+  .c2 = V2 (-0x1.bcb7b158af938p-4),
+  .c3 = 0x1.63c78734e6d07p-4,
+  .c4 = V2 (-0x1.287461742fee4p-4),
+  .invln10 = 0x1.bcb7b1526e50ep-2,
+  .log10_2 = 0x1.34413509f79ffp-2,
+  /* Subtracted from the bit pattern of x before the exponent and the table
+     index are extracted.  */
+  .off = V2 (0x3fe6900900000000),
+  .sign_exp_mask = V2 (0xfff0000000000000),
+  /* Lower bound is 0x0010000000000000, the smallest positive normal double.
+     For optimised register use subnormals are detected after offset has been
+     subtracted, so the stored value is lower bound - offset (which wraps
+     around).  */
+  .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+  /* Compared against the narrowed high half of each lane, so this is the top
+     32 bits of asuint64(inf) - 0x0010000000000000.  */
+  .special_bound = V4 (0x7fe00000),
+};
+
+#define N (1 << V_LOG10_TABLE_BITS)
+#define IndexMask (N - 1)
+
+/* Table values for both input lanes, as produced by lookup: invc holds the
+   two invc entries and log10c holds the two log10c entries.  */
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t log10c;
+};
+
+/* Fetch the table rows selected by each lane of i and transpose them, so that
+   the two invc values share one vector and the two log10c values another.
+   The row index is the V_LOG10_TABLE_BITS bits directly below bit 52.  */
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  const int shift = 52 - V_LOG10_TABLE_BITS;
+  uint64_t idx_lo = (vgetq_lane_u64 (i, 0) >> shift) & IndexMask;
+  uint64_t idx_hi = (vgetq_lane_u64 (i, 1) >> shift) & IndexMask;
+
+  /* Each load picks up two consecutive doubles starting at the invc member
+     of the selected row.  */
+  float64x2_t row_lo = vld1q_f64 (&__v_log10_data.table[idx_lo].invc);
+  float64x2_t row_hi = vld1q_f64 (&__v_log10_data.table[idx_hi].invc);
+
+  /* First elements of both rows go to invc, second elements to log10c.  */
+  struct entry e = { .invc = vuzp1q_f64 (row_lo, row_hi),
+		     .log10c = vuzp2q_f64 (row_lo, row_hi) };
+  return e;
+}
+
+/* Slow path used when at least one lane was flagged as special.  Rebuilds x
+   from the offset bit pattern, completes the fast-path result and hands
+   both, with the lane mask widened to 64 bits, to v_call_f64 together with
+   the scalar log10.  */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+	      uint32x2_t special, const struct data *d)
+{
+  /* Undo the offset subtraction done by the caller to recover the input.  */
+  uint64x2_t x_bits = vaddq_u64 (u_off, d->off);
+  float64x2_t fast_result = vfmaq_f64 (hi, y, r2);
+  uint64x2_t special_lanes = vmovl_u32 (special);
+  return v_call_f64 (log10, vreinterpretq_f64_u64 (x_bits), fast_result,
+		     special_lanes);
+}
+
+/* Fast implementation of double-precision vector log10
+   is a slight modification of double-precision vector log.
+   Max ULP error: < 2.5 ulp (nearest rounding.)
+   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+   _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+				       want 0x1.fff6be3cae4b9p-6.
+   Lanes whose input is zero, subnormal, negative, infinite or NaN are
+   flagged and routed through special_case.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint64x2_t u = vreinterpretq_u64_f64 (x);
+  uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+  uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+  float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+  struct entry e = lookup (u_off);
+
+  /* u_off - offset_lower_bound equals u - lower bound (mod 2^64).  Its high
+     32 bits reach special_bound only when u is below the lower bound (the
+     subtraction wraps around) or u >= asuint64(inf).  */
+  uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+				 vget_low_u32 (d->special_bound));
+
+  /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2).  */
+  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  float64x2_t kd = vcvtq_f64_s64 (k);
+
+  /* hi = r / log(10) + log10(c) + k*log10(2).
+     Constants in v_log10_data.c are computed (in extended precision) as
+     e.log10c := e.logc * invln10.
+     cte holds { invln10, log10_2 }; this first step adds r * invln10.  */
+  float64x2_t cte = vld1q_f64 (&d->invln10);
+  float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
+
+  /* hi += k * log10(2).  */
+  hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
+
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.
+     odd_coeffs holds { c1, c3 }.  The outermost r2 factor and the addition
+     of hi are applied by the final fma below (or in special_case).  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+  float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+  float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+  y = vfmaq_f64 (y, d->c4, r2);
+  y = vfmaq_f64 (p, y, r2);
+
+  if (unlikely (v_any_u32h (special)))
+    return special_case (hi, u_off, y, r2, special, d);
+  return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log10), 1.97)
+TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c
@ -0,0 +1,106 @@
+/*
+ * Single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Constants for the single-precision vector log10 routine.  c1, c3, c5 and
+   c7 are declared consecutively because the routine loads all four with one
+   vld1q_f32 starting at c1 and selects them by lane; keep them in this
+   order.  */
+static const struct data
+{
+  float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
+  uint32x4_t off, offset_lower_bound;
+  uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
+  float c1, c3, c5, c7;
+} data = {
+  /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+      [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25.  */
+  .c0 = V4 (-0x1.bcb79cp-3f),
+  .c1 = 0x1.2879c8p-3f,
+  .c2 = V4 (-0x1.bcd472p-4f),
+  .c3 = 0x1.6408f8p-4f,
+  .c4 = V4 (-0x1.246f8p-4f),
+  .c5 = 0x1.f0e514p-5f,
+  .c6 = V4 (-0x1.0fc92cp-4f),
+  .c7 = 0x1.f5f76ap-5f,
+  .ln2 = V4 (0x1.62e43p-1f),
+  .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
+  .off = V4 (0x3f2aaaab),	/* 0.666667.  */
+  .mantissa_mask = V4 (0x007fffff),
+};
+
+/* Slow path used when at least one lane was flagged as special.  Restores x
+   by adding the offset back, completes the fast-path result y + p * r2 and
+   passes both, with the lane mask widened to 32 bits, to v_call_f32 together
+   with the scalar log10f.  */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+	      uint16x4_t cmp, const struct data *d)
+{
+  uint32x4_t x_bits = vaddq_u32 (u_off, d->off);
+  float32x4_t fast_result = vfmaq_f32 (y, p, r2);
+  uint32x4_t special_lanes = vmovl_u16 (cmp);
+  /* Fall back to scalar code.  */
+  return v_call_f32 (log10f, vreinterpretq_f32_u32 (x_bits), fast_result,
+		     special_lanes);
+}
+
+/* Fast implementation of AdvSIMD log10f,
+   uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and
+   an order 9 polynomial.
+   Maximum error: 3.305ulps (nearest rounding.)
+   _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+				 want 0x1.ffe2f4p-4.
+   Lanes whose input is zero, subnormal, negative, infinite or NaN are
+   flagged and routed through special_case.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* Lanes 0..3 hold c1, c3, c5, c7.  */
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u_off = vsubq_u32 (u_off, d->off);
+  float32x4_t n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
+
+  /* u_off - offset_lower_bound equals asuint32(x) - 0x00800000 (mod 2^32).
+     Its top 16 bits reach special_bound only when the input bits are below
+     0x00800000 (the subtraction wraps) or at least asuint32(inf).  */
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+				 vget_low_u16 (d->special_bound));
+
+  /* Keep the mantissa of u_off and add the offset back to form 1+r, then
+     subtract 1.  */
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+  float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log10(1+r) + n * log10(2).  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+
+  /* Pair up coefficients as cK + r * c(K+1), then combine the pairs with a
+     Horner scheme in r2.  */
+  float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+  float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+  float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+  float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+  float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+  float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+  float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
+  /* y = (n * ln2 + r) * InvLn(10).  The remaining term, poly * r2, is added
+     by the final fma (or in special_case) without further scaling.  */
+  float32x4_t y = vfmaq_f32 (r, d->ln2, n);
+  y = vmulq_f32 (y, d->inv_ln10);
+
+  if (unlikely (v_any_u16h (special)))
+    return special_case (y, u_off, poly, r2, special, d);
+  return vfmaq_f32 (y, poly, r2);
+}
+
+HALF_WIDTH_ALIAS_F1 (log10)
+
+TEST_SIG (V, F, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log10), 2.81)
+TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c
@ -0,0 +1,61 @@
+/*
+ * Double-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+/* Constants for vector log1p: the coefficient table passed to log1p_inline,
+   plus two bit patterns the entry point compares against to detect special
+   lanes.  inf is the bit pattern of +infinity and minus_one that of -1.0.  */
+const static struct data
+{
+  struct v_log1p_data d;
+  uint64x2_t inf, minus_one;
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
+	   .inf = V2 (0x7ff0000000000000),
+	   .minus_one = V2 (0xbff0000000000000) };
+
+#define BottomMask v_u64 (0xffffffff)
+
+/* Slow path used when at least one lane was flagged in cmp.  The vector
+   routine is evaluated on a copy of x produced by v_zerofy_f64 for the
+   flagged lanes, and the original x, that result and the mask are then
+   handed to v_call_f64 together with the scalar log1p.  */
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
+{
+  /* Side-step special lanes so fenv exceptions are not triggered
+     inadvertently.  */
+  float64x2_t safe_x = v_zerofy_f64 (x, cmp);
+  float64x2_t fast_result = log1p_inline (safe_x, &d->d);
+  return v_call_f64 (log1p, x, fast_result, cmp);
+}
+
+/* Vector log1p approximation using polynomial on reduced interval. Routine is
+   a modification of the algorithm used in scalar log1p, with no shortcut for
+   k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
+   _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
+					want 0x1.fd61d0727429fp+2 .  */
+VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint64x2_t bits = vreinterpretq_u64_f64 (x);
+  uint64x2_t abs_bits = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+  /* |x| has the bit pattern of infinity or above: infinities and NaNs.  */
+  uint64x2_t not_finite = vcgeq_u64 (abs_bits, d->inf);
+  /* Unsigned compare on the raw bits: sign set and magnitude >= 1, i.e.
+     x <= -1 (this also catches negative infinities and NaNs).  */
+  uint64x2_t at_or_below_minus_one = vcgeq_u64 (bits, d->minus_one);
+  uint64x2_t special_cases = vorrq_u64 (not_finite, at_or_below_minus_one);
+
+  if (unlikely (v_any_u64 (special_cases)))
+    return special_case (x, special_cases, d);
+
+  return log1p_inline (x, &d->d);
+}
+
+TEST_SIG (V, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_D1 (log1p), 1.95)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c
@ -0,0 +1,92 @@
+/*
+ * Single-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
+
+/* Constants for the WANT_SIMD_EXCEPT build of vector log1pf: the coefficient
+   table passed to log1pf_inline, plus the bit patterns used to detect
+   special lanes.  minus_one is the bit pattern of -1.0f.  */
+const static struct data
+{
+  uint32x4_t minus_one, thresh;
+  struct v_log1pf_data d;
+} data = {
+  .d = V_LOG1PF_CONSTANTS_TABLE,
+  .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound.  */
+  .minus_one = V4 (0xbf800000),
+};
+
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23.  */
+#  define TinyBound v_u32 (0x34000000)
+
+/* Slow path used when at least one lane was flagged in cmp.  The vector
+   routine is evaluated on a copy of x produced by v_zerofy_f32 for the
+   flagged lanes, and the original x, that result and the mask are then
+   handed to v_call_f32 together with the scalar log1pf.  */
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
+{
+  /* Side-step special lanes so fenv exceptions are not triggered
+     inadvertently.  */
+  float32x4_t safe_x = v_zerofy_f32 (x, cmp);
+  float32x4_t fast_result = log1pf_inline (safe_x, &d->d);
+  return v_call_f32 (log1pf, x, fast_result, cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+   error is 1.69 ULP:
+   _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+				 want 0x1.cfcbdcp-3.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint32x4_t bits = vreinterpretq_u32_f32 (x);
+  uint32x4_t abs_bits = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+  /* One unsigned compare covers both ends: |x| below TinyBound wraps around
+     to a large value, and |x| at or above infinity lands on or past
+     thresh.  */
+  uint32x4_t tiny_or_not_finite
+      = vcgeq_u32 (vsubq_u32 (abs_bits, TinyBound), d->thresh);
+  /* Unsigned compare on the raw bits: sign set and magnitude >= 1.  */
+  uint32x4_t at_or_below_minus_one = vcgeq_u32 (bits, d->minus_one);
+  uint32x4_t special_cases
+      = vorrq_u32 (tiny_or_not_finite, at_or_below_minus_one);
+
+  if (unlikely (v_any_u32 (special_cases)))
+    return special_case (x, special_cases, d);
+
+  return log1pf_inline (x, &d->d);
+}
+
+#else
+
+/* Without WANT_SIMD_EXCEPT only the coefficient table passed to
+   log1pf_inline is needed; special lanes are detected with floating-point
+   compares against immediate constants.  */
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
+
+/* Slow path used when at least one lane was flagged in cmp.  Evaluates the
+   vector routine on x unchanged and hands x, that result and the mask to
+   v_call_f32 together with the scalar log1pf.  */
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+  float32x4_t fast_result = log1pf_inline (x, ptr_barrier (&data));
+  return v_call_f32 (log1pf, x, fast_result, cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+   error is 1.63 ULP:
+   _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+				 want 0x1.fdcb16p-3.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+  uint32x4_t at_or_below_minus_one = vcleq_f32 (x, v_f32 (-1));
+  uint32x4_t magnitude_in_range = vcaleq_f32 (x, v_f32 (0x1p127f));
+
+  /* OR-NOT: a lane is special when x <= -1 or when |x| <= 0x1p127 does not
+     hold.  The latter is also the case for NaN, since the compare is then
+     false.  */
+  uint32x4_t special_cases
+      = vornq_u32 (at_or_below_minus_one, magnitude_in_range);
+
+  if (unlikely (v_any_u32 (special_cases)))
+    return special_case (x, special_cases);
+
+  return log1pf_inline (x, ptr_barrier (&data));
+}
+
+#endif
+
+HALF_WIDTH_ALIAS_F1 (log1p)
+
+TEST_SIG (V, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_F1 (log1p), 1.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c
@ -0,0 +1,123 @@
+/*
+ * Double-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Constants for the double-precision vector log2 routine.  The scalar
+   members form two adjacent pairs (c1/c3 and invln2/c4) because the routine
+   fetches each pair with one vld1q_f64 and picks the value it needs by lane,
+   so their order within the struct must be preserved.  */
+static const struct data
+{
+  uint64x2_t off, sign_exp_mask, offset_lower_bound;
+  uint32x4_t special_bound;
+  float64x2_t c0, c2;
+  double c1, c3, invln2, c4;
+} data = {
+  /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
+     and N = 128, then scaled by log2(e) in extended precision and rounded back
+     to double precision.  */
+  .c0 = V2 (-0x1.71547652b8300p-1),
+  .c1 = 0x1.ec709dc340953p-2,
+  .c2 = V2 (-0x1.71547651c8f35p-2),
+  .c3 = 0x1.2777ebe12dda5p-2,
+  .c4 = -0x1.ec738d616fe26p-3,
+  .invln2 = 0x1.71547652b82fep0,
+  /* Subtracted from the bit pattern of x before the exponent and the table
+     index are extracted.  */
+  .off = V2 (0x3fe6900900000000),
+  .sign_exp_mask = V2 (0xfff0000000000000),
+  /* Lower bound is 0x0010000000000000, the smallest positive normal double.
+     For optimised register use subnormals are detected after offset has been
+     subtracted, so the stored value is lower bound - offset (which wraps
+     around).  */
+  .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+  /* Compared against the narrowed high half of each lane, so this is the top
+     32 bits of asuint64(inf) - asuint64(0x1p-1022).  */
+  .special_bound = V4 (0x7fe00000),
+};
+
+#define N (1 << V_LOG2_TABLE_BITS)
+#define IndexMask (N - 1)
+
+/* Table values for both input lanes, as produced by lookup: invc holds the
+   two invc entries and log2c holds the two log2c entries.  */
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t log2c;
+};
+
+/* Fetch the table rows selected by each lane of i and transpose them, so that
+   the two invc values share one vector and the two log2c values another.
+   The row index is the V_LOG2_TABLE_BITS bits directly below bit 52.  */
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  const int shift = 52 - V_LOG2_TABLE_BITS;
+  uint64_t idx_lo = (vgetq_lane_u64 (i, 0) >> shift) & IndexMask;
+  uint64_t idx_hi = (vgetq_lane_u64 (i, 1) >> shift) & IndexMask;
+
+  /* Each load picks up two consecutive doubles starting at the invc member
+     of the selected row.  */
+  float64x2_t row_lo = vld1q_f64 (&__v_log2_data.table[idx_lo].invc);
+  float64x2_t row_hi = vld1q_f64 (&__v_log2_data.table[idx_hi].invc);
+
+  /* First elements of both rows go to invc, second elements to log2c.  */
+  struct entry e = { .invc = vuzp1q_f64 (row_lo, row_hi),
+		     .log2c = vuzp2q_f64 (row_lo, row_hi) };
+  return e;
+}
+
+/* Slow path used when at least one lane was flagged as special.  Rebuilds x
+   from the offset bit pattern, completes the fast-path result and hands
+   both, with the lane mask widened to 64 bits, to v_call_f64 together with
+   the scalar log2.  */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+	      uint32x2_t special, const struct data *d)
+{
+  /* Undo the offset subtraction done by the caller to recover the input.  */
+  uint64x2_t x_bits = vaddq_u64 (u_off, d->off);
+  float64x2_t fast_result = vfmaq_f64 (hi, y, r2);
+  uint64x2_t special_lanes = vmovl_u32 (special);
+  return v_call_f64 (log2, vreinterpretq_f64_u64 (x_bits), fast_result,
+		     special_lanes);
+}
+
+/* Double-precision vector log2 routine. Implements the same algorithm as
+   vector log10, with coefficients and table entries scaled in extended
+   precision. The maximum observed error is 2.58 ULP:
+   _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+				      want 0x1.fffb34198d9ddp-5.
+   Lanes whose input is zero, subnormal, negative, infinite or NaN are
+   flagged and routed through special_case.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint64x2_t u = vreinterpretq_u64_f64 (x);
+  uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+  uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+  float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+  struct entry e = lookup (u_off);
+
+  /* u_off - offset_lower_bound equals u - lower bound (mod 2^64).  Its high
+     32 bits reach special_bound only when u is below the lower bound (the
+     subtraction wraps around) or u >= asuint64(inf).  */
+  uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+				 vget_low_u32 (d->special_bound));
+
+  /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k.  */
+  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  float64x2_t kd = vcvtq_f64_s64 (k);
+
+  /* hi = log2(c) + k + r / log(2).  invln2_and_c4 holds { invln2, c4 }.  */
+  float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
+  float64x2_t hi
+      = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
+
+  /* y = c0 + r*c1 + r2*(c2 + r*c3 + r2*c4), with odd_coeffs = { c1, c3 }.
+     The result is hi + y * r2, formed by the final fma below (or in
+     special_case).  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+  float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+  float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+  y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
+  y = vfmaq_f64 (p, r2, y);
+
+  if (unlikely (v_any_u32h (special)))
+    return special_case (hi, u_off, y, r2, special, d);
+  return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log2), 2.09)
+TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c
@ -0,0 +1,102 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Constants for the single-precision vector log2 routine.  c1, c3, c5 and c7
+   are declared consecutively because the routine loads all four with one
+   vld1q_f32 starting at c1 and selects them by lane; keep them in this
+   order.  */
+static const struct data
+{
+  float32x4_t c0, c2, c4, c6, c8;
+  uint32x4_t off, offset_lower_bound;
+  uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
+  float c1, c3, c5, c7;
+} data = {
+  /* Coefficients generated using Remez algorithm approximate
+     log2(1+r)/r for r in [ -1/3, 1/3 ].
+     rel error: 0x1.c4c4b0cp-26.  */
+  .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)).  */
+  .c1 = -0x1.715458p-1f,
+  .c2 = V4 (0x1.ec701cp-2f),
+  .c3 = -0x1.7171a4p-2f,
+  .c4 = V4 (0x1.27a0b8p-2f),
+  .c5 = -0x1.e5143ep-3f,
+  .c6 = V4 (0x1.9d8ecap-3f),
+  .c7 = -0x1.c675bp-3f,
+  .c8 = V4 (0x1.9e495p-3f),
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
+  .off = V4 (0x3f2aaaab),	/* 0.666667.  */
+  .mantissa_mask = V4 (0x007fffff),
+};
+
+/* Slow path used when at least one lane was flagged as special.  Restores x
+   by adding the offset back, completes the fast-path result n + p * r and
+   passes both, with the lane mask widened to 32 bits, to v_call_f32 together
+   with the scalar log2f.  */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+	      uint16x4_t cmp, const struct data *d)
+{
+  uint32x4_t x_bits = vaddq_u32 (u_off, d->off);
+  float32x4_t fast_result = vfmaq_f32 (n, p, r);
+  uint32x4_t special_lanes = vmovl_u16 (cmp);
+  /* Fall back to scalar code.  */
+  return v_call_f32 (log2f, vreinterpretq_f32_u32 (x_bits), fast_result,
+		     special_lanes);
+}
+
+/* Fast implementation for single precision AdvSIMD log2,
+   relies on same argument reduction as AdvSIMD logf.
+   Maximum error: 2.48 ULPs
+   _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+				want 0x1.a9be8p-2.
+   Lanes whose input is zero, subnormal, negative, infinite or NaN are
+   flagged and routed through special_case.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u_off = vsubq_u32 (u_off, d->off);
+  float32x4_t n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
+
+  /* u_off - offset_lower_bound equals asuint32(x) - 0x00800000 (mod 2^32).
+     Its top 16 bits reach special_bound only when the input bits are below
+     0x00800000 (the subtraction wraps) or at least asuint32(inf).  */
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+				 vget_low_u16 (d->special_bound));
+
+  /* Keep the mantissa of u_off and add the offset back to form 1+r, then
+     subtract 1.  */
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+  float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log2(1+r) + n.  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+
+  /* Lanes 0..3 of c1357 hold c1, c3, c5, c7.  Pair up coefficients as
+     cK + r * c(K+1), then combine the pairs and c8 with a Horner scheme in
+     r2.  The result is n + p * r.  */
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
+  float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+  float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+  float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+  float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+  float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+  float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+  float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+  float32x4_t p = vfmaq_f32 (c01, r2, p28);
+
+  if (unlikely (v_any_u16h (special)))
+    return special_case (n, u_off, p, r, special, d);
+  return vfmaq_f32 (n, p, r);
+}
+
+HALF_WIDTH_ALIAS_F1 (log2)
+
+TEST_SIG (V, F, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log2), 1.99)
+TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c
@ -0,0 +1,88 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+/* Constants for the single-precision vector log routine.  c1, c3, c5 and c0
+   are declared consecutively, in that order, because the routine loads all
+   four with one vld1q_f32 starting at c1 and selects them by lane (c0 is
+   lane 3); keep them in this order.  */
+static const struct data
+{
+  float32x4_t c2, c4, c6, ln2;
+  uint32x4_t off, offset_lower_bound, mantissa_mask;
+  uint16x8_t special_bound;
+  float c1, c3, c5, c0;
+} data = {
+  /* 3.34 ulp error.  */
+  .c0 = -0x1.3e737cp-3f,
+  .c1 = 0x1.5a9aa2p-3f,
+  .c2 = V4 (-0x1.4f9934p-3f),
+  .c3 = 0x1.961348p-3f,
+  .c4 = V4 (-0x1.00187cp-2f),
+  .c5 = 0x1.555d7cp-2f,
+  .c6 = V4 (-0x1.ffffc8p-2f),
+  .ln2 = V4 (0x1.62e43p-1f),
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
+  .off = V4 (0x3f2aaaab),	/* 0.666667.  */
+  .mantissa_mask = V4 (0x007fffff)
+};
+
+/* Slow path used when at least one lane was flagged as special.  Restores x
+   by adding the offset back, completes the fast-path result p + y * r2 and
+   passes both, with the lane mask widened to 32 bits, to v_call_f32 together
+   with the scalar logf.  */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+	      uint16x4_t cmp, const struct data *d)
+{
+  uint32x4_t x_bits = vaddq_u32 (u_off, d->off);
+  float32x4_t fast_result = vfmaq_f32 (p, y, r2);
+  uint32x4_t special_lanes = vmovl_u16 (cmp);
+  /* Fall back to scalar code.  */
+  return v_call_f32 (logf, vreinterpretq_f32_u32 (x_bits), fast_result,
+		     special_lanes);
+}
+
+/* Single-precision vector log.  The input is reduced to x = 2^n * (1+r) and
+   log(1+r) is approximated by a polynomial in r.  Lanes whose input is zero,
+   subnormal, negative, infinite or NaN are flagged and routed through
+   special_case.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* Lanes 0..3 hold c1, c3, c5, c0.  */
+  float32x4_t c1350 = vld1q_f32 (&d->c1);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  float32x4_t n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
+  /* u_off - offset_lower_bound equals asuint32(x) - 0x00800000 (mod 2^32).
+     Its top 16 bits reach special_bound only when the input bits are below
+     0x00800000 (the subtraction wraps) or at least asuint32(inf).  */
+  uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+			     vget_low_u16 (d->special_bound));
+
+  /* Keep the mantissa of u_off and add the offset back to form 1+r, then
+     subtract 1.  */
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+  float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log(1+r) + n*ln2.  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
+  float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+  float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+  float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+  /* Innermost term: p = c2 + r*c1 + r2*c0.  */
+  p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
+  q = vfmaq_f32 (q, p, r2);
+  y = vfmaq_f32 (y, q, r2);
+  /* p is reused from here on for the linear part n*ln2 + r.  */
+  p = vfmaq_f32 (r, d->ln2, n);
+
+  if (unlikely (v_any_u16h (cmp)))
+    return special_case (p, u_off, y, r2, cmp, d);
+  return vfmaq_f32 (p, y, r2);
+}
+
+HALF_WIDTH_ALIAS_F1 (log)
+
+TEST_SIG (V, F, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log), 2.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000)
+TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000)
+TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c
@ -0,0 +1,33 @@
+/*
+ * Double-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modf algorithm. Produces exact values in all rounding modes.
+   Stores the integral part of each lane of x to out_int and returns the
+   fractional part.  */
+float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int)
+{
+  /* Integral part: round each lane towards zero and write it out.  */
+  float64x2_t int_part = vrndq_f64 (x);
+  vst1q_f64 (out_int, int_part);
+
+  /* Lanes that are already integral must yield +0, so build an all-ones
+     mask for them and clear every bit of the difference there.  */
+  uint64x2_t integral_lanes = vceqq_f64 (x, int_part);
+  uint64x2_t frac_bits = vreinterpretq_u64_f64 (vsubq_f64 (x, int_part));
+  return vreinterpretq_f64_u64 (vbicq_u64 (frac_bits, integral_lanes));
+}
+
+TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN2vl8_modf_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c
@ -0,0 +1,34 @@
+/*
+ * Single-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modff algorithm. Produces exact values in all rounding modes.
+   Stores the integral part of each lane of x to out_int and returns the
+   fractional part.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x,
+						    float *out_int)
+{
+  /* Integral part: round each lane towards zero and write it out.  */
+  float32x4_t int_part = vrndq_f32 (x);
+  vst1q_f32 (out_int, int_part);
+
+  /* Lanes that are already integral must yield +0, so build an all-ones
+     mask for them and clear every bit of the difference there.  */
+  uint32x4_t integral_lanes = vceqq_f32 (x, int_part);
+  uint32x4_t frac_bits = vreinterpretq_u32_f32 (vsubq_f32 (x, int_part));
+  return vreinterpretq_f32_u32 (vbicq_u32 (frac_bits, integral_lanes));
+}
+
+TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN4vl4_modff_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c
@ -1,20 +1,17 @@
 /*
 * Double-precision vector pow function.
 *
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 /* Defines parameters of the approximation and scalar fallback.  */
 #include "finite_pow.h"

-#define VecSmallExp v_u64 (SmallExp)
-#define VecThresExp v_u64 (ThresExp)
-
 #define VecSmallPowX v_u64 (SmallPowX)
 #define VecThresPowX v_u64 (ThresPowX)
 #define VecSmallPowY v_u64 (SmallPowY)
@ -22,34 +19,49 @@

 static const struct data
 {
-  float64x2_t log_poly[7];
-  float64x2_t exp_poly[3];
-  float64x2_t ln2_hi, ln2_lo;
-  float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n;
+  uint64x2_t inf;
+  float64x2_t small_powx;
+  uint64x2_t offset, mask;
+  uint64x2_t mask_sub_0, mask_sub_1;
+  float64x2_t log_c0, log_c2, log_c4, log_c5;
+  double log_c1, log_c3;
+  double ln2_lo, ln2_hi;
+  uint64x2_t small_exp, thres_exp;
+  double ln2_lo_n, ln2_hi_n;
+  double inv_ln2_n, exp_c2;
+  float64x2_t exp_c0, exp_c1;
 } data = {
+  /* Power threshold.  */
+  .inf = V2 (0x7ff0000000000000),
+  .small_powx = V2 (0x1p-126),
+  .offset = V2 (Off),
+  .mask = V2 (0xfffULL << 52),
+  .mask_sub_0 = V2 (1ULL << 52),
+  .mask_sub_1 = V2 (52ULL << 52),
  /* Coefficients copied from v_pow_log_data.c
     relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
     Coefficients are scaled to match the scaling during evaluation.  */
-  .log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2),
-		V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4),
-		V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8),
-		V2 (-0x1.0002b8b263fc3p-3 * -8) },
-  .ln2_hi = V2 (0x1.62e42fefa3800p-1),
-  .ln2_lo = V2 (0x1.ef35793c76730p-45),
+  .log_c0 = V2 (0x1.555555555556p-2 * -2),
+  .log_c1 = -0x1.0000000000006p-2 * -2,
+  .log_c2 = V2 (0x1.999999959554ep-3 * 4),
+  .log_c3 = -0x1.555555529a47ap-3 * 4,
+  .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
+  .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
  /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
     (0.550 without fma) if |x| < ln2/512.  */
-  .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
-		V2 (0x1.5555576a5adcep-5) },
-  .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics.  */
-  .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2.  */
-  .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N.  */
-  .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
+  .exp_c0 = V2 (0x1.fffffffffffd4p-2),
+  .exp_c1 = V2 (0x1.5555571d6ef9p-3),
+  .exp_c2 = 0x1.5555576a5adcep-5,
+  .small_exp = V2 (0x3c90000000000000),
+  .thres_exp = V2 (0x03f0000000000000),
+  .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2.  */
+  .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N.  */
+  .ln2_lo_n = -0x1.c610ca86c3899p-45,
 };

-#define A(i) data.log_poly[i]
-#define C(i) data.exp_poly[i]
-
-/* This version implements an algorithm close to AOR scalar pow but
+/* This version implements an algorithm close to scalar pow but
   - does not implement the trick in the exp's specialcase subroutine to avoid
     double-rounding,
   - does not use a tail in the exponential core computation,
@ -78,10 +90,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center.  */
-  uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
-  int64x2_t k
-      = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
-  uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
+  uint64x2_t tmp = vsubq_u64 (ix, d->offset);
+  int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+  uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
  float64x2_t z = vreinterpretq_f64_u64 (iz);
  float64x2_t kd = vcvtq_f64_s64 (k);
  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
@ -92,12 +103,13 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible.  */
  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
  /* k*Ln2 + log(c) + r.  */
-  float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
+  float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
+  float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
  float64x2_t t2 = vaddq_f64 (t1, r);
-  float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
+  float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
  float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
  /* Evaluation is optimized assuming superscalar pipelined execution.  */
-  float64x2_t ar = vmulq_f64 (A (0), r);
+  float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
  float64x2_t ar2 = vmulq_f64 (r, ar);
  float64x2_t ar3 = vmulq_f64 (r, ar2);
  /* k*Ln2 + log(c) + r + A[0]*r*r.  */
@ -105,9 +117,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
  float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
  float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
  /* p = log1p(r) - r - A[0]*r*r.  */
-  float64x2_t a56 = vfmaq_f64 (A (5), r, A (6));
-  float64x2_t a34 = vfmaq_f64 (A (3), r, A (4));
-  float64x2_t a12 = vfmaq_f64 (A (1), r, A (2));
+  float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
+  float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
+  float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
+  float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
  float64x2_t p = vfmaq_f64 (a34, ar2, a56);
  p = vfmaq_f64 (a12, ar2, p);
  p = vmulq_f64 (ar3, p);
@ -118,29 +131,37 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
  return y;
 }

+static float64x2_t VPCS_ATTR NOINLINE
+exp_special_case (float64x2_t x, float64x2_t xtail)
+{
+  /* Scalar fallback: evaluate exp_nosignbias lane by lane and repack.  */
+  double lane0 = exp_nosignbias (x[0], xtail[0]);
+  double lane1 = exp_nosignbias (x[1], xtail[1]);
+  return (float64x2_t){ lane0, lane1 };
+}
+
 /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.  */
 static inline float64x2_t
-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
 {
  /* Fallback to scalar exp_inline for all lanes if any lane
     contains value of x s.t. |x| <= 2^-54 or >= 512.  */
-  uint64x2_t abstop
-      = vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff));
-  uint64x2_t uoflowx
-      = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
+  uint64x2_t uoflowx = vcgeq_u64 (
+      vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
+      d->thres_exp);
  if (unlikely (v_any_u64 (uoflowx)))
-    return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1));
+    return exp_special_case (x, vnegq_f64 (neg_xtail));
+
  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
  /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N].  */
-  float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
-  float64x2_t kd = vaddq_f64 (z, d->shift);
-  uint64x2_t ki = vreinterpretq_u64_f64 (kd);
-  kd = vsubq_f64 (kd, d->shift);
-  float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
-  r = vfmsq_f64 (r, kd, d->ln2_lo_n);
+  float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
+  float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
+  float64x2_t kd = vrndnq_f64 (z);
+  uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
+  float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
+  float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
+  r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
  /* The code assumes 2^-200 < |xtail| < 2^-8/N.  */
-  r = vaddq_f64 (r, xtail);
+  r = vsubq_f64 (r, neg_xtail);
  /* 2^(k/N) ~= scale.  */
  uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
  uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
@ -149,8 +170,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
  sbits = vaddq_u64 (sbits, top);
  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1).  */
  float64x2_t r2 = vmulq_f64 (r, r);
-  float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
-  tmp = vfmaq_f64 (C (0), r, tmp);
+  float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
+  tmp = vfmaq_f64 (d->exp_c0, r, tmp);
  tmp = vfmaq_f64 (r, r2, tmp);
  float64x2_t scale = vreinterpretq_f64_u64 (sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
@ -158,54 +179,59 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
  return vfmaq_f64 (scale, scale, tmp);
 }

+static float64x2_t NOINLINE VPCS_ATTR
+scalar_fallback (float64x2_t x, float64x2_t y)
+{
+  /* Hand each (x, y) lane pair to the scalar routine, then repack.  */
+  double lane0 = pow_scalar_special_case (x[0], y[0]);
+  double lane1 = pow_scalar_special_case (x[1], y[1]);
+  return (float64x2_t){ lane0, lane1 };
+}
+
 float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
 {
  const struct data *d = ptr_barrier (&data);
  /* Case of x <= 0 is too complicated to be vectorised efficiently here,
     fallback to scalar pow for all lanes if any x < 0 detected.  */
  if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
-    return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
+    return scalar_fallback (x, y);

  uint64x2_t vix = vreinterpretq_u64_f64 (x);
  uint64x2_t viy = vreinterpretq_u64_f64 (y);
-  uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
-  uint64x2_t vtopy = vshrq_n_u64 (viy, 52);
-  uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff));
-  uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff));
+  uint64x2_t iay = vandq_u64 (viy, d->inf);

  /* Special cases of x or y.  */
 #if WANT_SIMD_EXCEPT
  /* Small or large.  */
+  uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
+  uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
  uint64x2_t specialx
      = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
  uint64x2_t specialy
      = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
 #else
-  /* Inf or nan.  */
-  uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff));
-  uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff));
  /* The case y==0 does not trigger a special case, since in this case it is
     necessary to fix the result only if x is a signalling nan, which already
     triggers a special case. We test y==0 directly in the scalar fallback.  */
+  uint64x2_t iax = vandq_u64 (vix, d->inf);
+  uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
+  uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
 #endif
  uint64x2_t special = vorrq_u64 (specialx, specialy);
  /* Fallback to scalar on all lanes if any lane is inf or nan.  */
  if (unlikely (v_any_u64 (special)))
-    return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
+    return scalar_fallback (x, y);

  /* Small cases of x: |x| < 0x1p-126.  */
-  uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX);
+  uint64x2_t smallx = vcaltq_f64 (x, d->small_powx);
  if (unlikely (v_any_u64 (smallx)))
    {
      /* Update ix if top 12 bits of x are 0.  */
-      uint64x2_t sub_x = vceqzq_u64 (vtopx);
+      uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52));
      if (unlikely (v_any_u64 (sub_x)))
 	{
 	  /* Normalize subnormal x so exponent becomes negative.  */
-	  uint64x2_t vix_norm
-	      = vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52)));
-	  vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff));
-	  vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
+	  uint64x2_t vix_norm = vreinterpretq_u64_f64 (
+	      vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
+	  vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
 	  vix = vbslq_u64 (sub_x, vix_norm, vix);
 	}
    }
@ -216,21 +242,20 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)

  /* Vector Exp(y_loghi, y_loglo).  */
  float64x2_t vehi = vmulq_f64 (y, vhi);
-  float64x2_t velo = vmulq_f64 (y, vlo);
  float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
-  velo = vsubq_f64 (velo, vemi);
-  return v_exp_inline (vehi, velo, d);
+  float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
+  return v_exp_inline (vehi, neg_velo, d);
 }

-PL_SIG (V, D, 2, pow)
-PL_TEST_ULP (V_NAME_D2 (pow), 0.55)
-PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
+TEST_SIG (V, D, 2, pow)
+TEST_ULP (V_NAME_D2 (pow), 0.55)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
 /* Wide intervals spanning the whole domain but shared between x and y.  */
-#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n)                                 \
-  PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n)                   \
-  PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n)                 \
-  PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n)                 \
-  PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n)                                \
+  TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n)                     \
+  TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n)                   \
+  TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n)                   \
+  TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
 #define EXPAND(str) str##000000000
 #define SHL52(str) EXPAND (str)
 V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
@ -248,12 +273,12 @@ V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
 V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
 V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
 /* x is negative, y is odd or even integer, or y is real not integer.  */
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
 /* 1.0^y.  */
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c
@ -0,0 +1,209 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Thresh v_u32 (0x7f000000) /* Max - Min.  */
+#define MantissaMask v_u32 (0x007fffff)
+
+#define A d->log2_poly
+#define C d->exp2f_poly
+
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
+#define Off v_u32 (0x3f35d000)
+
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_EXP2F_TABLE_BITS 5
+#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
+#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
+
+static const struct data
+{
+  struct
+  {
+    double invc, logc; /* 1/c and Scale * log2(c), see ylogx_core.  */
+  } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
+  float64x2_t log2_poly[4]; /* Read through macro A.  */
+  uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; /* Indexed modulo table size.  */
+  float64x2_t exp2f_poly[3]; /* Read through macro C.  */
+} data = {
+  .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
+	       {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
+	       {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
+	       {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
+	       {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
+	       {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
+	       {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
+	       {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
+	       {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
+	       {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
+	       {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
+	       {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
+	       {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
+	       {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
+	       {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
+	       {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
+	       {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
+	       {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
+	       {0x1p+0, 0x0p+0 * Scale},
+	       {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
+	       {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
+	       {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
+	       {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
+	       {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
+	       {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
+	       {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
+	       {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
+	       {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
+	       {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
+	       {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
+	       {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
+	       {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
+  .log2_poly = { /* log1p(r)/ln2 polynomial, rel err: 1.5 * 2^-30.  */
+		 V2 (-0x1.6ff5daa3b3d7cp-2 * Scale),
+		 V2 (0x1.ec81d03c01aebp-2 * Scale),
+		 V2 (-0x1.71547bb43f101p-1 * Scale),
+		 V2 (0x1.7154764a815cbp0 * Scale)},
+  .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+		0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+		0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+		0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+		0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+		0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+		0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+		0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+		0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+		0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+		0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
+  .exp2f_poly = { /* 2^r polynomial (C0..C2), rel err: 1.69 * 2^-34.  */
+		  V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale),
+		  V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale),
+		  V2 (0x1.62e42ff0c52d6p-1 / Scale)}};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
+{
+  /* Defer to scalar powf through v_call2_f32, passing the vector result
+     and the special-lane mask along.  */
+  float32x4_t fixed = v_call2_f32 (powf, x, y, ret, cmp);
+  return fixed;
+}
+
+static inline float64x2_t
+ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k,
+	    float64x2_t invc, float64x2_t logc, float64x2_t y)
+{
+  const float64x2_t *poly = d->log2_poly;
+
+  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k: form the reduced argument
+     z/c - 1 and the table part logc + k.  */
+  float64x2_t reduced = vfmaq_f64 (v_f64 (-1.0), iz, invc);
+  float64x2_t base = vaddq_f64 (logc, k);
+
+  /* Horner evaluation of the polynomial approximating log1p(r)/ln2, with
+     the table part folded in as the constant term.  */
+  float64x2_t acc = vfmaq_f64 (poly[1], reduced, poly[0]);
+  acc = vfmaq_f64 (poly[2], acc, reduced);
+  acc = vfmaq_f64 (poly[3], acc, reduced);
+  float64x2_t log2x = vfmaq_f64 (base, acc, reduced);
+
+  /* Multiply the logarithm by y.  */
+  return vmulq_f64 (log2x, y);
+}
+
+static inline float64x2_t
+log2_lookup (const struct data *d, uint32_t i)
+{
+  /* Bits [23 - V_POWF_LOG2_TABLE_BITS, 23) of i select the table entry; a
+     single load starting at invc fetches the { invc, logc } pair.  */
+  uint32_t idx = (i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask;
+  return vld1q_f64 (&d->log2_tab[idx].invc);
+}
+
+static inline uint64x1_t
+exp2f_lookup (const struct data *d, uint64_t i)
+{
+  /* Reduce i modulo the table size, then load that single entry.  */
+  uint64_t slot = i % (1 << V_EXP2F_TABLE_BITS);
+  return vld1_u64 (&d->exp2f_tab[slot]);
+}
+
+/* Exponential step of vector powf for two lanes.  ylogx is the double
+   precision product formed by ylogx_core; the result is narrowed to single
+   precision.  */
+static inline float32x2_t
+powf_core (const struct data *d, float64x2_t ylogx)
+{
+  /* N*x = k + r with r in [-1/2, 1/2].
+     kd and ki must round halfway cases identically: ki picks the table
+     entry and exponent, kd defines r.  vcvtaq rounds ties away from zero, so
+     kd uses vrndaq (not ties-to-even vrndnq); otherwise an exact tie such as
+     x = 2, y = 0x1p-6 (ylogx = 0.5) gives kd = 0 but ki = 1 and the result
+     is off by a factor 2^(1/N).  */
+  float64x2_t kd = vrndaq_f64 (ylogx);
+  int64x2_t ki = vcvtaq_s64_f64 (ylogx);
+  float64x2_t r = vsubq_f64 (ylogx, kd);
+
+  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).
+     s is built from the table entry selected by ki modulo the table size,
+     with ki shifted left and added to the entry's bit pattern.  */
+  uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)),
+			       exp2f_lookup (d, vgetq_lane_s64 (ki, 1)));
+  t = vaddq_u64 (
+      t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS)));
+  float64x2_t s = vreinterpretq_f64_u64 (t);
+  /* Horner form of C0*r^2 + C1*r + C2, then s + (s*r) * that value.  */
+  float64x2_t p = vfmaq_f64 (C[1], r, C[0]);
+  p = vfmaq_f64 (C[2], r, p);
+  p = vfmaq_f64 (s, p, vmulq_f64 (s, r));
+  return vcvt_f32_f64 (p);
+}
+
+/* Vector powf: the logarithm and exponential steps are evaluated in double
+   precision on each half of the input, and lanes flagged as special are
+   recomputed through special_case.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Flag lanes whose x bit pattern lies outside [Min, Max).  */
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  uint32x4_t special = vcgeq_u32 (vsubq_u32 (ix, Min), Thresh);
+
+  /* Split x into a scaled exponent k and a reduced value z, using the bits
+     of ix - Off above the mantissa.  */
+  uint32x4_t shifted = vsubq_u32 (ix, Off);
+  uint32x4_t expbits = vbicq_u32 (shifted, MantissaMask);
+  float32x4_t z = vreinterpretq_f32_u32 (vsubq_u32 (ix, expbits));
+  /* Arithmetic shift, so k keeps its sign.  */
+  int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (expbits),
+			     23 - V_EXP2F_TABLE_BITS);
+
+  /* One { invc, logc } table pair per lane, transposed into per-half invc
+     and logc vectors.  */
+  float64x2_t pair0 = log2_lookup (d, vgetq_lane_u32 (shifted, 0));
+  float64x2_t pair1 = log2_lookup (d, vgetq_lane_u32 (shifted, 1));
+  float64x2_t pair2 = log2_lookup (d, vgetq_lane_u32 (shifted, 2));
+  float64x2_t pair3 = log2_lookup (d, vgetq_lane_u32 (shifted, 3));
+  float64x2_t invc_lo = vzip1q_f64 (pair0, pair1);
+  float64x2_t logc_lo = vzip2q_f64 (pair0, pair1);
+  float64x2_t invc_hi = vzip1q_f64 (pair2, pair3);
+  float64x2_t logc_hi = vzip2q_f64 (pair2, pair3);
+
+  /* Promote the low and high halves of z, k and y to double precision.  */
+  float64x2_t z_lo = vcvt_f64_f32 (vget_low_f32 (z));
+  float64x2_t z_hi = vcvt_high_f64_f32 (z);
+  float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k)));
+  float64x2_t k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k));
+  float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y));
+  float64x2_t y_hi = vcvt_high_f64_f32 (y);
+
+  float64x2_t ylogx_lo = ylogx_core (d, z_lo, k_lo, invc_lo, logc_lo, y_lo);
+  float64x2_t ylogx_hi = ylogx_core (d, z_hi, k_hi, invc_hi, logc_hi, y_hi);
+
+  /* Also flag lanes where bits 47..62 of ylogx (sign excluded) reach those
+     of 126 * N.  They are read from the odd 32-bit elements, i.e. the high
+     word of each double on little-endian.  */
+  uint32x4_t hiwords = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo),
+				   vreinterpretq_u32_f64 (ylogx_hi));
+  uint32x4_t abstop = vandq_u32 (vshrq_n_u32 (hiwords, 15), v_u32 (0xffff));
+  uint32x4_t bound
+      = vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47);
+  special = vorrq_u32 (special, vcgeq_u32 (abstop, bound));
+
+  float32x2_t res_lo = powf_core (d, ylogx_lo);
+  float32x2_t res_hi = powf_core (d, ylogx_hi);
+  float32x4_t result = vcombine_f32 (res_lo, res_hi);
+
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, y, result, special);
+  return result;
+}
+
+HALF_WIDTH_ALIAS_F2 (pow)
+
+TEST_SIG (V, F, 2, pow)
+TEST_ULP (V_NAME_F2 (pow), 2.1)
+TEST_DISABLE_FENV (V_NAME_F2 (pow))
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c
@ -1,17 +1,19 @@
 /*
 * Double-precision vector sin function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

+#include "test_defs.h"
+#include "test_sig.h"
 #include "mathlib.h"
 #include "v_math.h"

 static const struct data
 {
  float64x2_t poly[7];
-  float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+  float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
 	    V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
@ -23,12 +25,13 @@ static const struct data
  .pi_1 = V2 (0x1.921fb54442d18p+1),
  .pi_2 = V2 (0x1.1a62633145c06p-53),
  .pi_3 = V2 (0x1.c1cd129024e09p-106),
-  .shift = V2 (0x1.8p52),
 };

 #if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255).  */
-# define Thresh v_u64 (0x1160000000000000)    /* RangeVal - TinyBound.  */
+/* asuint64(0x1p-253), below which multiply by inv_pi underflows.  */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound, with RangeVal = asuint64(0x1p23).  */
+# define Thresh v_u64 (0x1140000000000000)
 #endif

 #define C(i) d->poly[i]
@ -61,16 +64,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
     fenv). These lanes will be fixed by special-case handler later.  */
  uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
  cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
-  r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+  r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
 #else
  r = x;
  cmp = vcageq_f64 (x, d->range_val);
 #endif

  /* n = rint(|x|/pi).  */
-  n = vfmaq_f64 (d->shift, d->inv_pi, r);
-  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
-  n = vsubq_f64 (n, d->shift);
+  n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+  odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);

  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
  r = vfmsq_f64 (r, d->pi_1, n);
@ -95,3 +97,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
    return special_case (x, y, odd, cmp);
  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
 }
+
+TEST_SIG (V, D, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (sin), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c
@ -1,7 +1,7 @@
 /*
 * Double-precision vector sincos function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

@ -10,12 +10,21 @@
   be linked against the scalar sincosf from math/.  */
 #define _GNU_SOURCE
 #include <math.h>
-#undef _GNU_SOURCE

 #include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
 #include "v_sincos_common.h"

+/* Local sincos built from sin and cos, for scalar libm implementations
+   that do not provide one (anything other than glibc, and MSVC builds).
+   NOTE(review): a non-glibc libm that does declare sincos in <math.h> would
+   conflict with this static definition - confirm for the targets built.  */
+#if !(defined(__GLIBC__) && !defined(_MSC_VER))
+static void
+sincos (double x, double *out_sin, double *out_cos)
+{
+  double s = sin (x);
+  double c = cos (x);
+  *out_sin = s;
+  *out_cos = c;
+}
+#endif
+
 static void VPCS_ATTR NOINLINE
 special_case (float64x2_t x, uint64x2_t special, double *out_sin,
 	      double *out_cos)
@ -46,12 +55,13 @@ _ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
    special_case (x, special, out_sin, out_cos);
 }

-PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
-PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin)
+TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
+TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
 #define V_SINCOS_INTERVAL(lo, hi, n)                                          \
-  PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n)                           \
-  PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
-V_SINCOS_INTERVAL (0, 0x1p23, 500000)
-V_SINCOS_INTERVAL (-0, -0x1p23, 500000)
+  TEST_SYM_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n)                          \
+  TEST_SYM_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
+V_SINCOS_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000)
 V_SINCOS_INTERVAL (0x1p23, inf, 10000)
-V_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c
@ -1,7 +1,7 @@
 /*
 * Single-precision vector sincos function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

@ -10,11 +10,20 @@
   be linked against the scalar sincosf from math/.  */
 #define _GNU_SOURCE
 #include <math.h>
-#undef _GNU_SOURCE

 #include "v_sincosf_common.h"
 #include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
+
+/* Local sincosf built from sinf and cosf, for scalar libm implementations
+   that do not provide one (anything other than glibc, and MSVC builds).
+   NOTE(review): a non-glibc libm that does declare sincosf in <math.h>
+   would conflict with this static definition - confirm for the targets
+   built.  */
+#if !(defined(__GLIBC__) && !defined(_MSC_VER))
+static void
+sincosf (float x, float *out_sin, float *out_cos)
+{
+  float s = sinf (x);
+  float c = cosf (x);
+  *out_sin = s;
+  *out_cos = c;
+}
+#endif

 static void VPCS_ATTR NOINLINE
 special_case (float32x4_t x, uint32x4_t special, float *out_sin,
@ -47,12 +56,13 @@ _ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
    special_case (x, special, out_sin, out_cos);
 }

-PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
-PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos)
+TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
+TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
 #define V_SINCOSF_INTERVAL(lo, hi, n)                                         \
-  PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n)                          \
-  PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
-V_SINCOSF_INTERVAL (0, 0x1p20, 500000)
-V_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
+  TEST_SYM_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n)                         \
+  TEST_SYM_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
+V_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
 V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
-V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c
@ -0,0 +1,44 @@
+/*
+ * Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_sincospi_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+/* Double-precision vector sincospi(x): computes sin and cos together in a
+   single call, using separate argument reduction and shared low-order
+   polynomials.  Both results are written through the output pointers.
+   Maximum Error 3.09 ULP:
+  _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+					      want 0x1.fd54d0b327cf4p-1
+   Maximum Error 3.16 ULP:
+  _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+					      want 0x1.fd2da484ff402p-1.  */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos)
+{
+  const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
+  float64x2x2_t both = v_sincospi_inline (x, d);
+  float64x2_t sin_lanes = both.val[0];
+  float64x2_t cos_lanes = both.val[1];
+
+  /* Store each two-lane result through its output pointer.  */
+  vst1q_f64 (out_sin, sin_lanes);
+  vst1q_f64 (out_cos, cos_lanes);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin)
+TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59)
+TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66)
+#  define V_SINCOSPI_INTERVAL(lo, hi, n)                                      \
+    TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n)                      \
+    TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n)
+V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000)
+V_SINCOSPI_INTERVAL (0x1p63, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c
@ -0,0 +1,43 @@
+/*
+ * Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincospif_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+
+/* Single-precision vector sincospi: computes sinpi and cospi together in
+   one call, using shared argument reduction and polynomials.  Both results
+   are written through the output pointers.
+   Worst-case error for sin is 3.04 ULP:
+   _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+   Worst-case error for cos is 3.18 ULP:
+   _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos)
+{
+  const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
+  float32x4x2_t both = v_sincospif_inline (x, d);
+  float32x4_t sin_lanes = both.val[0];
+  float32x4_t cos_lanes = both.val[1];
+
+  /* Store each four-lane result through its output pointer.  */
+  vst1q_f32 (out_sin, sin_lanes);
+  vst1q_f32 (out_cos, cos_lanes);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos)
+TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54)
+TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68)
+#  define V_SINCOSPIF_INTERVAL(lo, hi, n)                                     \
+    TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n)                     \
+    TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n)
+V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
+V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c
@ -1,17 +1,19 @@
 /*
 * Single-precision vector sin function.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"

 static const struct data
 {
  float32x4_t poly[4];
-  float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+  float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
  /* 1.886 ulp error.  */
  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@ -22,13 +24,14 @@ static const struct data
  .pi_3 = V4 (-0x1.ee59dap-49f),

  .inv_pi = V4 (0x1.45f306p-2f),
-  .shift = V4 (0x1.8p+23f),
  .range_val = V4 (0x1p20f)
 };

 #if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f).  */
-# define Thresh v_u32 (0x28800000)    /* RangeVal - TinyBound.  */
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows.  */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound.  */
+# define Thresh v_u32 (0x27800000)
 #endif

 #define C(i) d->poly[i]
@ -41,7 +44,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
  return v_call_f32 (sinf, x, y, cmp);
 }

-float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
  float32x4_t n, r, r2, y;
@ -53,23 +56,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
  /* If fenv exceptions are to be triggered correctly, set any special lanes
     to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
     special-case handler later.  */
-  r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+  r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
 #else
  r = x;
  cmp = vcageq_f32 (x, d->range_val);
 #endif

-  /* n = rint(|x|/pi) */
-  n = vfmaq_f32 (d->shift, d->inv_pi, r);
-  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
-  n = vsubq_f32 (n, d->shift);
+  /* n = rint(|x|/pi).  */
+  n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);

-  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2) */
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
  r = vfmsq_f32 (r, d->pi_1, n);
  r = vfmsq_f32 (r, d->pi_2, n);
  r = vfmsq_f32 (r, d->pi_3, n);

-  /* y = sin(r) */
+  /* y = sin(r).  */
  r2 = vmulq_f32 (r, r);
  y = vfmaq_f32 (C (2), C (3), r2);
  y = vfmaq_f32 (C (1), y, r2);
@ -80,3 +82,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
    return special_case (x, y, odd, cmp);
  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }
+
+HALF_WIDTH_ALIAS_F1 (sin)
+
+TEST_SIG (V, F, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (sin), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c
@ -0,0 +1,80 @@
+/*
+ * Double-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+  struct v_expm1_data d; /* Constants handed to expm1_inline.  */
+  uint64x2_t halff;	 /* Bit pattern of 0.5; the sign of x is merged in.  */
+#if WANT_SIMD_EXCEPT
+  uint64x2_t tiny_bound, thresh;
+#else
+  float64x2_t large_bound;
+#endif
+} data = {
+  .d = V_EXPM1_DATA,
+  .halff = V2 (0x3fe0000000000000), /* asuint64 (0.5).  */
+#if WANT_SIMD_EXCEPT
+  /* 2^-26, below which sinh(x) rounds to x.  */
+  .tiny_bound = V2 (0x3e50000000000000),
+  /* asuint64 (0x1p9) - asuint64 (0x1p-26), i.e. large bound - tiny_bound.  */
+  .thresh = V2 (0x0230000000000000),
+#else
+  /* 2^9. expm1 helper overflows for large input.  */
+  .large_bound = V2 (0x1p+9),
+#endif
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+  return v_call_f64 (sinh, x, x, v_u64 (-1)); /* All-ones mask: scalar sinh on every lane.  */
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The greatest observed error is 2.52 ULP:
+   _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+				       want -0x1.ac2f05bb66fc9p-2.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* ax = |x|; halfsign = 0.5 with the sign bit of x (bit-select on bit 63).  */
+  float64x2_t ax = vabsq_f64 (x);
+  uint64x2_t ix = vreinterpretq_u64_f64 (x);
+  float64x2_t halfsign = vreinterpretq_f64_u64 (
+      vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
+  /* Special lanes: |x| >= 2^9, plus |x| < 2^-26 when WANT_SIMD_EXCEPT.  */
+#if WANT_SIMD_EXCEPT
+  uint64x2_t special = vcgeq_u64 (
+      vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+  uint64x2_t special = vcageq_f64 (x, d->large_bound);
+#endif
+
+  /* Fall back to scalar variant for all lanes if any of them are special.  */
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x);
+
+  /* Up to the point that expm1 overflows, we can use it to calculate sinh:
+     with t = expm1(|x|), t + t / (t + 1) = exp(|x|) - exp(-|x|). This form
+     retains acceptable accuracy for very small inputs.  */
+  float64x2_t t = expm1_inline (ax, &d->d);
+  t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+  return vmulq_f64 (t, halfsign); /* Halve and restore the sign of x.  */
+}
+
+TEST_SIG (V, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (sinh), 2.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c
@ -1,28 +1,25 @@
 /*
 * Single-precision vector sinh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
+#include "test_sig.h"
+#include "test_defs.h"
 #include "v_expm1f_inline.h"

 static const struct data
 {
  struct v_expm1f_data expm1f_consts;
-  uint32x4_t halff;
 #if WANT_SIMD_EXCEPT
  uint32x4_t tiny_bound, thresh;
 #else
-  uint32x4_t oflow_bound;
+  float32x4_t oflow_bound;
 #endif
 } data = {
  .expm1f_consts = V_EXPM1F_DATA,
-  .halff = V4 (0x3f000000),
 #if WANT_SIMD_EXCEPT
  /* 0x1.6a09e8p-32, below which expm1f underflows.  */
  .tiny_bound = V4 (0x2fb504f4),
@ -30,14 +27,15 @@ static const struct data
  .thresh = V4 (0x12fbbbb3),
 #else
  /* 0x1.61814ep+6, above which expm1f helper overflows.  */
-  .oflow_bound = V4 (0x42b0c0a7),
+  .oflow_bound = V4 (0x1.61814ep+6),
 #endif
 };

 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+	      uint32x4_t special)
 {
-  return v_call_f32 (sinhf, x, y, special);
+  return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
 }

 /* Approximation for vector single-precision sinh(x) using expm1.
@ -45,21 +43,21 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
   The maximum error is 2.26 ULP:
   _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
 				 want 0x1.e469e4p-4.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

  uint32x4_t ix = vreinterpretq_u32_f32 (x);
  float32x4_t ax = vabsq_f32 (x);
-  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-  uint32x4_t sign = veorq_u32 (ix, iax);
-  float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+  float32x4_t halfsign = vreinterpretq_f32_u32 (
+      vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));

 #if WANT_SIMD_EXCEPT
-  uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+  uint32x4_t special = vcgeq_u32 (
+      vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
  ax = v_zerofy_f32 (ax, special);
 #else
-  uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+  uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
 #endif

  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
@ -71,14 +69,16 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
  /* Fall back to the scalar variant for any lanes that should trigger an
     exception.  */
  if (unlikely (v_any_u32 (special)))
-    return special_case (x, vmulq_f32 (t, halfsign), special);
+    return special_case (x, t, halfsign, special);

  return vmulq_f32 (t, halfsign);
 }

-PL_SIG (V, F, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (sinh), 1.76)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
+HALF_WIDTH_ALIAS_F1 (sinh)
+
+TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (sinh), 1.76)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c
@ -1,15 +1,15 @@
 /*
 * Double-precision vector sinpi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -34,7 +34,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
 {
  /* Fall back to scalar code.  */
  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
-  return v_call_f64 (sinpi, x, y, cmp);
+  return v_call_f64 (arm_math_sinpi, x, y, cmp);
 }
 #endif

@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
 }

-PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (sinpi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c
@ -1,15 +1,15 @@
 /*
 * Single-precision vector sinpi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
@ -29,7 +29,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
 {
  /* Fall back to scalar code.  */
  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
-  return v_call_f32 (sinpif, x, y, cmp);
+  return v_call_f32 (arm_math_sinpif, x, y, cmp);
 }
 #endif

@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
    Maximum Error 3.03 ULP:
    _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
 				  want 0x1.f7cd5p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -72,10 +72,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }

-PL_SIG (V, F, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
+HALF_WIDTH_ALIAS_F1 (sinpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (sinpi), 2.54)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c
@ -1,19 +1,20 @@
 /*
 * Double-precision vector tan(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  float64x2_t poly[9];
-  float64x2_t half_pi, two_over_pi, shift;
+  double half_pi[2];
+  float64x2_t two_over_pi, shift;
 #if !WANT_SIMD_EXCEPT
  float64x2_t range_val;
 #endif
@ -71,8 +72,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
  /* Use q to reduce x to r in [-pi/4, pi/4], by:
     r = x - q * pi/2, in extended precision.  */
  float64x2_t r = x;
-  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
-  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
+  float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 1);
  /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
     formula.  */
  r = vmulq_n_f64 (r, 0.5);
@ -112,9 +114,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 		    vbslq_f64 (no_recip, d, n));
 }

-PL_SIG (V, D, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME_D1 (tan), 2.99)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
+TEST_SIG (V, D, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c
@ -1,19 +1,19 @@
 /*
 * Single-precision vector tan(x) function.
 *
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"

 static const struct data
 {
  float32x4_t poly[6];
-  float32x4_t pi_consts;
+  float pi_consts[4];
  float32x4_t shift;
 #if !WANT_SIMD_EXCEPT
  float32x4_t range_val;
@ -64,7 +64,7 @@ eval_poly (float32x4_t z, const struct data *d)
   Maximum error is 3.45 ULP:
   __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
 			    want 0x1.ff9850p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);
  float32x4_t special_arg = x;
@ -85,16 +85,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
 #endif

  /* n = rint(x/(pi/2)).  */
-  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
+  float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
  float32x4_t n = vsubq_f32 (q, d->shift);
  /* Determine if x lives in an interval, where |tan(x)| grows to infinity.  */
  uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));

  /* r = x - n * (pi/2)  (range reduction into -pi./4 .. pi/4).  */
  float32x4_t r;
-  r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
-  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
-  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
+  r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+  r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+  r = vfmaq_laneq_f32 (r, n, pi_consts, 2);

  /* If x lives in an interval, where |tan(x)|
     - is finite, then use a polynomial approximation of the form
@ -119,9 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
  return vbslq_f32 (pred_alt, inv_y, y);
 }

-PL_SIG (V, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME_F1 (tan), 2.96)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
+HALF_WIDTH_ALIAS_F1 (tan)
+
+TEST_SIG (V, F, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (tan), 2.96)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c
@ -0,0 +1,67 @@
+/*
+ * Double-precision vector tanh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+  struct v_expm1_data d; /* Constants handed to expm1_inline.  */
+  uint64x2_t thresh, tiny_bound;
+} data = {
+  .d = V_EXPM1_DATA,
+  .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27).  */
+  /* asuint64 (0x1.241bf835f9d5fp+4) - tiny_bound (already a bit pattern).  */
+  .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+	      uint64x2_t special)
+{
+  return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special); /* q / qp2 is the vector-path result.  */
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+   version of expm1. The greatest observed error is 2.70 ULP:
+   _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+				       want -0x1.be5452a6459fbp-3.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+  float64x2_t u = x;
+  /* Trigger special-cases for tiny, boring and infinity/NaN. The unsigned
+     subtraction wraps for |x| < tiny_bound, so one compare covers both.  */
+  uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+  /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+     They will be fixed up later by the special-case handler.  */
+  if (unlikely (v_any_u64 (special)))
+    u = v_zerofy_f64 (u, special);
+#endif
+
+  u = vaddq_f64 (u, u); /* u = 2x.  */
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1), with q = e^2x - 1 from expm1_inline
+     and qp2 = q + 2 = e^2x + 1.  */
+  float64x2_t q = expm1_inline (u, &d->d);
+  float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
+
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x, q, qp2, special);
+  return vdivq_f64 (q, qp2);
+}
+
+TEST_SIG (V, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (tanh), 2.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c
@ -1,14 +1,13 @@
 /*
 * Single-precision vector tanh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
+#include "test_sig.h"
+#include "test_defs.h"
 #include "v_expm1f_inline.h"

 static const struct data
@ -20,20 +19,23 @@ static const struct data
  /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for  negative).  */
  .boring_bound = V4 (0x41102cb3),
  .large_bound = V4 (0x7f800000),
-  .onef = V4 (0x3f800000),
 };

 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+	      float32x4_t q, uint32x4_t special)
 {
-  return v_call_f32 (tanhf, x, y, special);
+  return v_call_f32 (
+      tanhf, x,
+      vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+      special);
 }

 /* Approximation for single-precision vector tanh(x), using a simplified
   version of expm1f. The maximum error is 2.58 ULP:
   _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
 				want 0x1.f9ba08p-5.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -42,7 +44,9 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
  uint32x4_t sign = veorq_u32 (ix, iax);
  uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
-  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+  /* expm1 exponent bias is 1.0f reinterpreted to int.  */
+  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+      sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));

 #if WANT_SIMD_EXCEPT
  /* If fp exceptions are to be triggered properly, set all special and boring
@ -58,16 +62,20 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)

  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
  float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
-  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
  if (unlikely (v_any_u32 (special)))
-    return special_case (vreinterpretq_f32_u32 (ix),
-			 vbslq_f32 (is_boring, boring, y), special);
+    return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+			 special);
+
+  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
  return vbslq_f32 (is_boring, boring, y);
 }

-PL_SIG (V, F, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (tanh), 2.09)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
+HALF_WIDTH_ALIAS_F1 (tanh)
+
+TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (tanh), 2.09)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c
@ -0,0 +1,88 @@
+/*
+ * Double-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct v_tanpi_data
+{
+  float64x2_t c0, c2, c4, c6, c8, c10, c12;
+  double c1, c3, c5, c7, c9, c11, c13, c14; /* Loaded in adjacent pairs.  */
+} tanpi_data = {
+  /* Coefficients for tan(pi * x) computed with fpminimax
+     on [ 0x1p-1022 0x1p-2 ]
+     approx rel error: 0x1.7eap-55
+     approx abs error: 0x1.7eap-55.  */
+  .c0 = V2 (0x1.921fb54442d18p1), /* pi.  */
+  .c1 = 0x1.4abbce625be52p3,	  .c2 = V2 (0x1.466bc6775b0f9p5),
+  .c3 = 0x1.45fff9b426f5ep7,	  .c4 = V2 (0x1.45f4730dbca5cp9),
+  .c5 = 0x1.45f3265994f85p11,	  .c6 = V2 (0x1.45f4234b330cap13),
+  .c7 = 0x1.45dca11be79ebp15,	  .c8 = V2 (0x1.47283fc5eea69p17),
+  .c9 = 0x1.3a6d958cdefaep19,	  .c10 = V2 (0x1.927896baee627p21),
+  .c11 = -0x1.89333f6acd922p19,	  .c12 = V2 (0x1.5d4e912bb8456p27),
+  .c13 = -0x1.a854d53ab6874p29,	  .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x).
+   The maximum error is 3.06 ULP:
+   _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+				       want -0x1.fa30112702c95p+3.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
+{
+  const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+  float64x2_t n = vrndnq_f64 (x);
+  /* Reduce by the nearest integer: xr = x - n, ar = |xr| <= 0.5.
+     inf produces nan that propagates.  */
+  float64x2_t xr = vsubq_f64 (x, n);
+  float64x2_t ar = vabdq_f64 (x, n);
+  uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
+  float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
+  /* Flipped lanes (ar > 0.25) use r = 0.5 - ar and take the reciprocal below.
+     Order-14 pairwise Horner.  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+  /* Scalar coefficients are loaded two at a time for the by-lane FMAs.  */
+  float64x2_t c_1_3 = vld1q_f64 (&d->c1);
+  float64x2_t c_5_7 = vld1q_f64 (&d->c5);
+  float64x2_t c_9_11 = vld1q_f64 (&d->c9);
+  float64x2_t c_13_14 = vld1q_f64 (&d->c13);
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
+
+  float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
+  p = vfmaq_f64 (p1011, r4, p);
+  p = vfmaq_f64 (p89, r4, p);
+  p = vfmaq_f64 (p67, r4, p);
+  p = vfmaq_f64 (p45, r4, p);
+  p = vfmaq_f64 (p23, r4, p);
+  p = vfmaq_f64 (p01, r4, p);
+  p = vmulq_f64 (r, p);
+  /* tan(pi * (0.5 - r)) = 1 / tan(pi * r), so flipped lanes take 1 / p.  */
+  float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
+  float64x2_t y = vbslq_f64 (flip, p_recip, p);
+  /* xr ^ |xr| isolates the sign bit of xr, which is then set on y.  */
+  uint64x2_t sign
+      = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
+  return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
+TEST_ULP (V_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c
@ -0,0 +1,70 @@
+/*
+ * Single-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct v_tanpif_data
+{
+  float32x4_t c0, c2, c4, c6;
+  float c1, c3, c5, c7; /* Loaded together as one vector.  */
+} tanpif_data = {
+  /* Coefficients for tan(pi * x).  */
+  .c0 = V4 (0x1.921fb4p1f),  .c1 = 0x1.4abbcep3f,      .c2 = V4 (0x1.466b8p5f),
+  .c3 = 0x1.461c72p7f,	     .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
+  .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
+};
+
+/* Approximation for single-precision vector tanpi(x).
+   The maximum error is 3.34 ULP:
+   _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
+				 want 0x1.f70aa6p+2.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
+{
+  const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
+
+  float32x4_t n = vrndnq_f32 (x);
+  /* Reduce by the nearest integer: xr = x - n, ar = |xr| <= 0.5.
+     inf produces nan that propagates.  */
+  float32x4_t xr = vsubq_f32 (x, n);
+  float32x4_t ar = vabdq_f32 (x, n);
+  uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
+  float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
+  /* Flipped lanes (ar > 0.25) use r = 0.5 - ar and take the reciprocal below.
+     Order-7 pairwise Horner polynomial evaluation scheme.  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t r4 = vmulq_f32 (r2, r2);
+  /* odd_coeffs = { c1, c3, c5, c7 }, read as one vector from the struct.  */
+  float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
+  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
+  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
+  float32x4_t p = vfmaq_f32 (p45, r4, p67);
+  p = vfmaq_f32 (p23, r4, p);
+  p = vfmaq_f32 (p01, r4, p);
+  /* tan(pi * (0.5 - r)) = 1 / tan(pi * r), so flipped lanes take 1 / p.  */
+  p = vmulq_f32 (r, p);
+  float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
+  float32x4_t y = vbslq_f32 (flip, p_recip, p);
+  /* xr ^ |xr| isolates the sign bit of xr, which is then set on y.  */
+  uint32x4_t sign
+      = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
+  return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
+}
+
+HALF_WIDTH_ALIAS_F1 (tanpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
+TEST_ULP (V_NAME_F1 (tanpi), 2.84)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h
@ -0,0 +1,58 @@
+/*
+ * Helper for single-precision routines which calculate exp(ax) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPF_INLINE_H
+#define MATH_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+  float ln2_hi, ln2_lo, c0, c2; /* Loaded as one vector; lane order matters.  */
+  float32x4_t inv_ln2, c1, c3, c4;
+  /* asuint(1.0f).  */
+  uint32x4_t exponent_bias;
+};
+
+/* Initializer for struct v_expf_data. maxerr: 1.45358 +0.5 ulp.  */
+#define V_EXPF_DATA                                                           \
+  {                                                                           \
+    .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f,    \
+    .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f),                     \
+    .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f,                        \
+    .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000),         \
+  }
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+  /* Helper routine for calculating exp(ax), where ax = |x|.
+     Copied from v_expf.c, with all special-case handling removed - the
+     calling routine should handle special values if required.  */
+
+  /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     ax = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+  float32x4_t ax = vabsq_f32 (x);
+  float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); /* { ln2_hi, ln2_lo, c0, c2 }.  */
+  float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+  float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+  r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+  /* scale = 2^n, built by adding n << 23 to the bits of 1.0f.
+     Custom order-4 Estrin avoids building high order monomial.  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+  float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+  q = vfmaq_f32 (q, p, r2);
+  p = vmulq_f32 (d->c4, r);
+  float32x4_t poly = vfmaq_f32 (p, q, r2);
+  return vfmaq_f32 (scale, poly, scale); /* scale * (1 + poly).  */
+}
+
+#endif // MATH_V_EXPF_INLINE_H
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h
@ -0,0 +1,86 @@
+/*
+ * Helper for double-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1_INLINE_H
+#define MATH_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+  float64x2_t c2, c4, c6, c8;
+  float64x2_t invln2;
+  int64x2_t exponent_bias;
+  double c1, c3, c5, c7, c9, c10;
+  double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2].  */
+#define V_EXPM1_DATA                                                          \
+  {                                                                           \
+    .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5),              \
+    .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10),             \
+    .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16),            \
+    .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22),            \
+    .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29,                \
+    .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },                   \
+    .invln2 = V2 (0x1.71547652b82fep0),                                       \
+    .exponent_bias = V2 (0x3ff0000000000000),                                 \
+  }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+  /* Helper routine for calculating exp(x) - 1.  */
+
+  float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+  int64x2_t i = vcvtq_s64_f64 (n);
+  float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+  f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  float64x2_t f2 = vmulq_f64 (f, f);
+  float64x2_t f4 = vmulq_f64 (f2, f2);
+  float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+  float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+  float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+  float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+  float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+  float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+  p = vfmaq_f64 (p47, f4, p);
+  p = vfmaq_f64 (p03, f4, p);
+
+  p = vfmaq_f64 (f, f2, p);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i.  */
+  int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+  float64x2_t t = vreinterpretq_f64_s64 (u);
+
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif // MATH_V_EXPM1_INLINE_H
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h
@ -0,0 +1,62 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1F_INLINE_H
+#define MATH_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1f_data
+{
+  float32x4_t c0, c2;
+  int32x4_t exponent_bias;
+  float c1, c3, inv_ln2, c4;
+  float ln2_hi, ln2_lo;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+   log(2)/2]. Exponent bias is asuint(1.0f).  */
+#define V_EXPM1F_DATA                                                         \
+  {                                                                           \
+    .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5),  \
+    .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10,                                \
+    .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f,              \
+    .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f,                        \
+  }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+  /* Helper routine for calculating exp(x) - 1.  */
+
+  float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+  float32x4_t lane_consts = vld1q_f32 (&d->c1);
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
+  int32x4_t i = vcvtq_s32_f32 (j);
+  float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+  f = vfmsq_lane_f32 (f, j, ln2, 1);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).  */
+  float32x4_t f2 = vmulq_f32 (f, f);
+  float32x4_t f4 = vmulq_f32 (f2, f2);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+  float32x4_t p = vfmaq_f32 (p01, f2, p23);
+  p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
+  p = vfmaq_f32 (f, f2, p);
+
+  /* t = 2^i.  */
+  int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+  float32x4_t t = vreinterpretq_f32_s32 (u);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif // MATH_V_EXPM1F_INLINE_H
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h
@ -0,0 +1,119 @@
+/*
+ * Helper for vector double-precision routines which calculate log(1 + x) and
+ * do not need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef MATH_V_LOG1P_INLINE_H
+#define MATH_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+
+struct v_log1p_data
+{
+  float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
+  uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+  int64x2_t one_top;
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+  double ln2[2];
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].  */
+#define V_LOG1P_CONSTANTS_TABLE                                               \
+  {                                                                           \
+    .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2,             \
+    .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3,             \
+    .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3,             \
+    .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4,             \
+    .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4,             \
+    .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4,           \
+    .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4,           \
+    .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5,           \
+    .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4,           \
+    .c18 = -0x1.cfa7385bdb37ep-6,                                             \
+    .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },                   \
+    .hf_rt2_top = V2 (0x3fe6a09e00000000),                                    \
+    .one_m_hf_rt2_top = V2 (0x00095f6200000000),                              \
+    .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff)                   \
+  }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
+{
+  /* Approximate log(1+m) on [sqrt(2)/2-1, sqrt(2)-1] with pairwise Horner.  */
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+  float64x2_t c1315 = vld1q_f64 (&d->c13);
+  float64x2_t c1718 = vld1q_f64 (&d->c17);
+  float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
+  float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
+  float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
+  p = vfmaq_f64 (p1415, m2, p);
+  p = vfmaq_f64 (p1213, m2, p);
+  p = vfmaq_f64 (p1011, m2, p);
+  p = vfmaq_f64 (p89, m2, p);
+  p = vfmaq_f64 (p67, m2, p);
+  p = vfmaq_f64 (p45, m2, p);
+  p = vfmaq_f64 (p23, m2, p);
+  return vfmaq_f64 (p01, m2, p);
+}
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+  /* Helper for calculating log(x + 1):
+     - No special-case handling - this should be dealt with by the caller.
+     - Optionally simulate the shortcut for k=0, used in the scalar routine,
+       using v_sel, for improved accuracy when the argument to log1p is close
+       to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
+       in the source of the caller before including this file.  */
+  float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
+  uint64x2_t mi = vreinterpretq_u64_f64 (m);
+  uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+  int64x2_t ki
+      = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+  float64x2_t k = vcvtq_f64_s64 (ki);
+
+  /* Reduce m = x + 1 to [sqrt(2)/2, sqrt(2)]; f is that value minus 1.  */
+  uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+  uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+  float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
+
+  /* Correction term c/m.  */
+  float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+# error                                                                       \
+      "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+  /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+     that the approximation is solely the polynomial.  */
+  uint64x2_t k0 = vceqzq_f64 (k);
+  cm = v_zerofy_f64 (cm, k0);
+  f = vbslq_f64 (k0, x, f);
+#endif
+
+  /* Approximate log1p(f) on the reduced input using a polynomial.  */
+  float64x2_t f2 = vmulq_f64 (f, f);
+  float64x2_t p = eval_poly (f, f2, d);
+
+  /* Assemble log1p(x) = k * log2 + log1p(f) + c/m.  */
+  float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+  float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
+  float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
+  return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif // MATH_V_LOG1P_INLINE_H
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h
@ -0,0 +1,94 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_LOG1PF_INLINE_H
+#define MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+struct v_log1pf_data
+{
+  uint32x4_t four;
+  int32x4_t three_quarters;
+  float c0, c3, c5, c7;
+  float32x4_t c4, c6, c1, c2, ln2;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+   (1, -0.5) are not stored as they can be generated more efficiently.  */
+#define V_LOG1PF_CONSTANTS_TABLE                                              \
+  {                                                                           \
+    .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f),                         \
+    .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f,                         \
+    .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f,                          \
+    .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f,                          \
+    .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000),                       \
+    .three_quarters = V4 (0x3f400000)                                         \
+  }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
+{
+  /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner.  */
+  float32x4_t c0357 = vld1q_f32 (&d->c0);
+  float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
+  float32x4_t m2 = vmulq_f32 (m, m);
+  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+  float32x4_t p = vfmaq_f32 (p45, m2, p67);
+  p = vfmaq_f32 (p23, m2, p);
+  p = vfmaq_f32 (d->c1, m, p);
+  p = vmulq_f32 (m2, p);
+  p = vfmaq_f32 (m, m2, p);
+  return vfmaq_f32 (p, m2, q);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
+{
+  /* Helper for calculating log(x + 1).  */
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+			   is in [-0.25, 0.5]):
+     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*k*log(2) to ensure the scale is representable
+     as a normalised fp32 number.  */
+  float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+  /* Choose k to scale x to the range [-1/4, 1/2].  */
+  int32x4_t k
+      = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
+		   v_s32 (0xff800000));
+  uint32x4_t ku = vreinterpretq_u32_s32 (k);
+
+  /* Scale up to ensure that the scale factor is representable as normalised
+     fp32 number, and scale m down accordingly.  */
+  float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+  /* Scale x by exponent manipulation.  */
+  float32x4_t m_scale
+      = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+  m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+
+  /* Evaluate polynomial on the reduced interval.  */
+  float32x4_t p = eval_poly (m_scale, d);
+
+  /* The scale factor to be applied back at the end - by multiplying float(k)
+     by 2^-23 we get the unbiased exponent of k.  */
+  float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+
+  /* Apply the scaling back.  */
+  return vfmaq_f32 (p, scale_back, d->ln2);
+}
+
+#endif //  MATH_V_LOG1PF_INLINE_H
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h
@ -1,7 +1,7 @@
 /*
 * Double-precision vector log(x) function - inline version
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

@ -57,8 +57,8 @@ log_lookup (uint64x2_t i)
 {
  /* Since N is a power of 2, n % N = n & (N - 1).  */
  struct entry e;
-  uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
-  uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
  e.invc = vuzp1q_f64 (e0, e1);
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h
@ -1,36 +1,63 @@
 /*
 * Vector math abstractions.
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #ifndef _V_MATH_H
 #define _V_MATH_H

-#ifndef WANT_VMATH
-/* Enable the build of vector math code.  */
-# define WANT_VMATH 1
+#if !__aarch64__
+# error "Cannot build without AArch64"
 #endif

-#if WANT_VMATH
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))

-# if __aarch64__
-#  define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-# else
-#  error "Cannot build without AArch64"
-# endif
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
+#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun

-# include <stdint.h>
-# include "math_config.h"
-# if __aarch64__
+#if USE_GLIBC_ABI

-#  include <arm_neon.h>
+# define HALF_WIDTH_ALIAS_F1(fun)                                             \
+    float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x)                   \
+    {                                                                         \
+      return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x)));          \
+    }
+
+# define HALF_WIDTH_ALIAS_F2(fun)                                             \
+    float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y)   \
+    {                                                                         \
+      return vget_low_f32 (                                                   \
+	  _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y)));     \
+    }
+
+#else
+# define HALF_WIDTH_ALIAS_F1(fun)
+# define HALF_WIDTH_ALIAS_F2(fun)
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+#include <arm_neon.h>

 /* Shorthand helpers for declaring constants.  */
-#  define V2(X) { X, X }
-#  define V4(X) { X, X, X, X }
-#  define V8(X) { X, X, X, X, X, X, X, X }
+#define V2(X)                                                                 \
+  {                                                                           \
+    X, X                                                                      \
+  }
+#define V4(X)                                                                 \
+  {                                                                           \
+    X, X, X, X                                                                \
+  }
+#define V8(X)                                                                 \
+  {                                                                           \
+    X, X, X, X, X, X, X, X                                                    \
+  }

 static inline int
 v_any_u16h (uint16x4_t x)
@ -38,6 +65,12 @@ v_any_u16h (uint16x4_t x)
  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
 }

+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
 static inline float32x4_t
 v_f32 (float x)
 {
@ -54,7 +87,7 @@ v_s32 (int32_t x)
  return (int32x4_t) V4 (x);
 }

-/* true if any elements of a vector compare result is non-zero.  */
+/* true if any element of a v_cond result is non-zero.  */
 static inline int
 v_any_u32 (uint32x4_t x)
 {
@ -97,6 +130,11 @@ v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
  return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
 }

+static inline int
+v_lanes64 (void)
+{
+  return 2;
+}
 static inline float64x2_t
 v_f64 (double x)
 {
@ -113,20 +151,13 @@ v_s64 (int64_t x)
  return (int64x2_t) V2 (x);
 }

-/* true if any elements of a vector compare result is non-zero.  */
+/* true if any element of a v_cond result is non-zero.  */
 static inline int
 v_any_u64 (uint64x2_t x)
 {
  /* assume elements in x are either 0 or -1u.  */
  return vpaddd_u64 (x) != 0;
 }
-/* true if all elements of a vector compare result is 1.  */
-static inline int
-v_all_u64 (uint64x2_t x)
-{
-  /* assume elements in x are either 0 or -1u.  */
-  return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
-}
 static inline float64x2_t
 v_lookup_f64 (const double *tab, uint64x2_t idx)
 {
@ -137,7 +168,6 @@ v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
 {
  return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
 }
-
 static inline float64x2_t
 v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
 {
@ -169,7 +199,4 @@ v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
  return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
 }

-# endif
-#endif
-
 #endif
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h
@ -2,12 +2,12 @@
 * Helpers for evaluating polynomials on single-precision AdvSIMD input, using
 * various schemes.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

-#ifndef PL_MATH_POLY_ADVSIMD_F32_H
-#define PL_MATH_POLY_ADVSIMD_F32_H
+#ifndef MATH_POLY_ADVSIMD_F32_H
+#define MATH_POLY_ADVSIMD_F32_H

 #include <arm_neon.h>

--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h
@ -2,12 +2,12 @@
 * Helpers for evaluating polynomials on double-precision AdvSIMD input, using
 * various schemes.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

-#ifndef PL_MATH_POLY_ADVSIMD_F64_H
-#define PL_MATH_POLY_ADVSIMD_F64_H
+#ifndef MATH_POLY_ADVSIMD_F64_H
+#define MATH_POLY_ADVSIMD_F64_H

 #include <arm_neon.h>

--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h
@ -1,12 +1,12 @@
 /*
 * Core approximation for double-precision vector sincos
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_poly_f64.h"

 static const struct v_sincos_data
 {
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h
@ -1,7 +1,7 @@
 /*
 * Core approximation for single-precision vector sincos
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h
@ -0,0 +1,64 @@
+/*
+ * Helper for Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincospi_data
+{
+  float64x2_t poly[10], range_val;
+} v_sincospi_data = {
+  /* Polynomial coefficients generated using Remez algorithm,
+     see sinpi.sollya for details.  */
+  .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+	    V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+	    V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+	    V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+	    V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+  .range_val = V2 (0x1p63),
+};
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+   one function call, using separate argument reduction and shared low-order
+   polynomials.
+   Approximation for vector double-precision sincospi(x).
+   Maximum Error 3.09 ULP:
+  _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+					      want 0x1.fd54d0b327cf4p-1
+   Maximum Error 3.16 ULP:
+  _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+					      want 0x1.fd2da484ff402p-1.  */
+static inline float64x2x2_t
+v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
+{
+  /* If r is odd, the sign of the result should be inverted for sinpi
+     and reintroduced for cospi.  */
+  uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
+  uint64x2_t odd = vshlq_n_u64 (
+      vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
+
+  /* r = x - round(x), with ties rounded away from zero.  */
+  float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.  */
+  float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
+
+  /* Pairwise Horner approximation for y = sin(r * pi).  */
+  float64x2_t sr2 = vmulq_f64 (sr, sr);
+  float64x2_t sr4 = vmulq_f64 (sr2, sr2);
+  float64x2_t cr2 = vmulq_f64 (cr, cr);
+  float64x2_t cr4 = vmulq_f64 (cr2, cr2);
+
+  float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
+  float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
+
+  float64x2_t sinpix
+      = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
+
+  float64x2_t cospix
+      = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
+
+  return (float64x2x2_t){ sinpix, cospix };
+}
--- a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h
+++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h
@ -0,0 +1,57 @@
+/*
+ * Helper for Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+const static struct v_sincospif_data
+{
+  float32x4_t poly[6], range_val;
+} v_sincospif_data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+	    V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+  .range_val = V4 (0x1p31f),
+};
+
+/* Single-precision vector function allowing calculation of both sinpi and
+   cospi in one function call, using shared argument reduction and polynomials.
+   Worst-case error for sin is 3.04 ULP:
+   _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+   Worst-case error for cos is 3.18 ULP:
+   _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+static inline float32x4x2_t
+v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
+{
+  /* If r is odd, the sign of the result should be inverted for sinpi and
+     reintroduced for cospi.  */
+  uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
+  uint32x4_t odd = vshlq_n_u32 (
+      vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
+
+  /* r = x - round(x), with ties rounded away from zero.  */
+  float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.  */
+  float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
+
+  /* Pairwise Horner approximation for y = sin(r * pi).  */
+  float32x4_t sr2 = vmulq_f32 (sr, sr);
+  float32x4_t sr4 = vmulq_f32 (sr2, sr2);
+  float32x4_t cr2 = vmulq_f32 (cr, cr);
+  float32x4_t cr4 = vmulq_f32 (cr2, cr2);
+
+  float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
+  float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
+
+  float32x4_t sinpix
+      = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
+  float32x4_t cospix
+      = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
+
+  return (float32x4x2_t){ sinpix, cospix };
+}
--- a/contrib/arm-optimized-routines/math/aarch64/cospi_3u5.c
+++ b/contrib/arm-optimized-routines/math/aarch64/cospi_3u5.c
@ -1,14 +1,14 @@
 /*
 * Double-precision scalar cospi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
 #include "poly_scalar_f64.h"

 /* Taylor series coefficents for sin(pi * x).
@ -29,9 +29,9 @@ static const double poly[]
   cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
 			       want 0x1.fffffffffd16ep-1.  */
 double
-cospi (double x)
+arm_math_cospi (double x)
 {
-  if (isinf (x))
+  if (isinf (x) || isnan (x))
    return __math_invalid (x);

  double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
@ -81,9 +81,18 @@ cospi (double x)
  return asdouble (asuint64 (y) ^ sign);
 }

-PL_SIG (S, D, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (cospi, 2.63)
-PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
-PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
+#if WANT_EXPERIMENTAL_MATH
+double
+cospi (double x)
+{
+  return arm_math_cospi (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_cospi, 2.63)
+TEST_SYM_INTERVAL (arm_math_cospi, 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0.5, 0x1p51f, 10000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0x1p51f, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/cospif_2u6.c
+++ b/contrib/arm-optimized-routines/math/aarch64/cospif_2u6.c
@ -1,14 +1,14 @@
 /*
 * Single-precision scalar cospi function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "mathlib.h"
 #include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 /* Taylor series coefficents for sin(pi * x).  */
 #define C0 0x1.921fb6p1f
@ -25,9 +25,9 @@
   cospif(0x1.37e844p-4) got 0x1.f16b3p-1
 			want 0x1.f16b2ap-1.  */
 float
-cospif (float x)
+arm_math_cospif (float x)
 {
-  if (isinf (x))
+  if (isinf (x) || isnan (x))
    return __math_invalidf (x);

  float ax = asfloat (asuint (x) & ~0x80000000);
@ -76,9 +76,18 @@ cospif (float x)
  return asfloat (asuint (y * r) ^ sign);
 }

-PL_SIG (S, F, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (cospif, 2.15)
-PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
-PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)
+#if WANT_EXPERIMENTAL_MATH
+float
+cospif (float x)
+{
+  return arm_math_cospif (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_cospif, 2.15)
+TEST_SYM_INTERVAL (arm_math_cospif, 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (arm_math_cospif, 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_cospif, 0.5, 0x1p22f, 10000)
+TEST_SYM_INTERVAL (arm_math_cospif, 0x1p22f, inf, 10000)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/README.contributors
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/README.contributors
@ -5,7 +5,6 @@ glibc-specific conventions need not be followed.
 The requirements for portable code apply to non-portable code with the
 following differences:

-
 1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There
   are no specific restrictions on acceptable ULP error, but if functions
   provide significantly less accuracy than portable equivalents then a clear
@ -15,9 +14,3 @@ following differences:

 2. Functions are assumed to support round-to-nearest mode by default, unless
   stated; other rounding modes are not required to be provided.
-
-3. Handling of special cases may be relaxed for vector functions. Checking
-   whether each vector lane contains special values such as NaN, Inf or
-   denormal numbers can prove too costly for vector functions. This is often
-   not required since vector functions are typically used along with aggressive
-   compiler optimization flags.
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/acos_2u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/acos_2u.c
@ -1,23 +1,23 @@
 /*
 * Double-precision acos(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "math_config.h"
 #include "poly_scalar_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

-#define AbsMask (0x7fffffffffffffff)
-#define Half (0x3fe0000000000000)
-#define One (0x3ff0000000000000)
-#define PiOver2 (0x1.921fb54442d18p+0)
-#define Pi (0x1.921fb54442d18p+1)
-#define Small (0x3c90000000000000) /* 2^-53.  */
-#define Small16 (0x3c90)
-#define QNaN (0x7ff8)
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define One 0x3ff0000000000000
+#define PiOver2 0x1.921fb54442d18p+0
+#define Pi 0x1.921fb54442d18p+1
+#define Small 0x3c90000000000000 /* 2^-53.  */
+#define Small16 0x3c90
+#define QNaN 0x7ff8

 /* Fast implementation of double-precision acos(x) based on polynomial
   approximation of double-precision asin(x).
@ -29,8 +29,8 @@

     acos(x) = pi/2 - asin(x)

-   and use an order 11 polynomial P such that the final approximation of asin is
-   an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
+   and use an order 11 polynomial P such that the final approximation of asin
+   is an odd polynomial: asin(x) ~ x + x^3 * P(x^2).

   The largest observed error in this region is 1.18 ulps,
   acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
@ -90,11 +90,11 @@ acos (double x)
  return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
 }

-PL_SIG (S, D, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (acos, 1.02)
-PL_TEST_INTERVAL (acos, 0, Small, 5000)
-PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
-PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (acos, -0, -inf, 20000)
+TEST_SIG (S, D, 1, acos, -1.0, 1.0)
+TEST_ULP (acos, 1.02)
+TEST_INTERVAL (acos, 0, Small, 5000)
+TEST_INTERVAL (acos, Small, 0.5, 50000)
+TEST_INTERVAL (acos, 0.5, 1.0, 50000)
+TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (acos, 0x1p11, inf, 20000)
+TEST_INTERVAL (acos, -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/acosf_1u4.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/acosf_1u4.c
@ -1,23 +1,23 @@
 /*
 * Single-precision acos(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "poly_scalar_f32.h"
 #include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

-#define AbsMask (0x7fffffff)
-#define Half (0x3f000000)
-#define One (0x3f800000)
-#define PiOver2f (0x1.921fb6p+0f)
-#define Pif (0x1.921fb6p+1f)
-#define Small (0x32800000) /* 2^-26.  */
-#define Small12 (0x328)
-#define QNaN (0x7fc)
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define PiOver2f 0x1.921fb6p+0f
+#define Pif 0x1.921fb6p+1f
+#define Small 0x32800000 /* 2^-26.  */
+#define Small12 0x328
+#define QNaN 0x7fc

 /* Fast implementation of single-precision acos(x) based on polynomial
   approximation of single-precision asin(x).
@ -89,11 +89,11 @@ acosf (float x)
  return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
 }

-PL_SIG (S, F, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (acosf, 0.82)
-PL_TEST_INTERVAL (acosf, 0, Small, 5000)
-PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
-PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
+TEST_SIG (S, F, 1, acos, -1.0, 1.0)
+TEST_ULP (acosf, 0.82)
+TEST_INTERVAL (acosf, 0, Small, 5000)
+TEST_INTERVAL (acosf, Small, 0.5, 50000)
+TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
+TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
+TEST_INTERVAL (acosf, -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/acosh_3u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/acosh_3u.c
@ -1,31 +1,26 @@
 /*
 * Double-precision acosh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

+#include "mathlib.h"
 #include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #define Ln2 (0x1.62e42fefa39efp-1)
 #define MinusZero (0x8000000000000000)
 #define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511).  */
 #define Two (0x4000000000000000)       /* asuint64(2.0).  */

-double
-optr_aor_log_f64 (double);
-
-double
-log1p (double);
-
 /* acosh approximation using a variety of approaches on different intervals:

   acosh(x) = ln(x + sqrt(x * x - 1)).

-   x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
-   close enough to x that we can calculate the result by ln(2x) == ln(x) +
+   x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1)
+   is close enough to x that we can calculate the result by ln(2x) == ln(x) +
   ln(2). The greatest observed error in this region is 0.98 ULP:
   acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9
 				want 0x1.28066a11a7c8p+9.
@ -48,19 +43,19 @@ acosh (double x)
    return __math_invalid (x);

  if (unlikely (ix >= SquareLim))
-    return optr_aor_log_f64 (x) + Ln2;
+    return log (x) + Ln2;

  if (ix >= Two)
-    return optr_aor_log_f64 (x + sqrt (x * x - 1));
+    return log (x + sqrt (x * x - 1));

  double xm1 = x - 1;
  return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1));
 }

-PL_SIG (S, D, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (acosh, 2.19)
-PL_TEST_INTERVAL (acosh, 0, 1, 10000)
-PL_TEST_INTERVAL (acosh, 1, 2, 100000)
-PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
-PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
-PL_TEST_INTERVAL (acosh, -0, -inf, 10000)
+TEST_SIG (S, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (acosh, 2.19)
+TEST_INTERVAL (acosh, 0, 1, 10000)
+TEST_INTERVAL (acosh, 1, 2, 100000)
+TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
+TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
+TEST_INTERVAL (acosh, -0, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/acoshf_2u8.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/acoshf_2u8.c
@ -1,27 +1,19 @@
 /*
 * Single-precision acosh(x) function.
 *
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

 #define Ln2 (0x1.62e4p-1f)
 #define MinusZero 0x80000000
 #define SquareLim 0x5f800000 /* asuint(0x1p64).  */
 #define Two 0x40000000

-/* Single-precision log from math/.  */
-float
-optr_aor_log_f32 (float);
-
-/* Single-precision log(1+x) from pl/math.  */
-float
-log1pf (float);
-
 /* acoshf approximation using a variety of approaches on different intervals:

   x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
@ -45,19 +37,19 @@ acoshf (float x)
    return __math_invalidf (x);

  if (unlikely (ix >= SquareLim))
-    return optr_aor_log_f32 (x) + Ln2;
+    return logf (x) + Ln2;

  if (ix > Two)
-    return optr_aor_log_f32 (x + sqrtf (x * x - 1));
+    return logf (x + sqrtf (x * x - 1));

  float xm1 = x - 1;
  return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1));
 }

-PL_SIG (S, F, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (acoshf, 2.30)
-PL_TEST_INTERVAL (acoshf, 0, 1, 100)
-PL_TEST_INTERVAL (acoshf, 1, 2, 10000)
-PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
-PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
-PL_TEST_INTERVAL (acoshf, -0, -inf, 10000)
+TEST_SIG (S, F, 1, acosh, 1.0, 10.0)
+TEST_ULP (acoshf, 2.30)
+TEST_INTERVAL (acoshf, 0, 1, 100)
+TEST_INTERVAL (acoshf, 1, 2, 10000)
+TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
+TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
+TEST_INTERVAL (acoshf, -0, -inf, 10000)
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/erfinv_25u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/erfinv_25u.c
@ -1,15 +1,15 @@
 /*
 * Double-precision inverse error function (AdvSIMD variant).
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
 #include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
 #include "mathlib.h"
 #include "math_config.h"
-#include "pl_sig.h"
-#include "poly_advsimd_f64.h"
+#include "test_sig.h"
+#include "v_poly_f64.h"
 #define V_LOG_INLINE_POLY_ORDER 4
 #include "v_log_inline.h"

@ -22,7 +22,7 @@ const static struct data
      can be taken.  */
  double P[8][2], Q[7][2];
  float64x2_t tailshift;
-  uint8x16_t idx;
+  uint8_t idx[16];
  struct v_log_inline_data log_tbl;
  float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
 } data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
@ -58,7 +58,7 @@ const static struct data
 		     V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
 		     V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
 	   .tailshift = V2 (-0.87890625),
-	   .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	   .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
 	   .log_tbl = V_LOG_CONSTANTS };

 static inline float64x2_t
@ -128,7 +128,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
  uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));

  uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
-  uint8x16_t idx = vaddq_u8 (d->idx, off);
+  uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off);

  float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
  t = vfmaq_f64 (t, x, x);
@ -150,12 +150,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
  return vdivq_f64 (p, q);
 }

-PL_SIG (V, D, 1, erfinv, -0.99, 0.99)
-PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8)
+#if USE_MPFR
+# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference
+#else
+TEST_SIG (V, D, 1, erfinv, -0.99, 0.99)
+TEST_ULP (V_NAME_D1 (erfinv), 24.8)
+TEST_DISABLE_FENV (V_NAME_D1 (erfinv))
+TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
 /* Test with control lane in each interval.  */
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
-			0.5)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
-			0.8)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
-			0.95)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/erfinvf_5u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/erfinvf_5u.c
@ -1,13 +1,13 @@
 /*
 * Single-precision inverse error function (AdvSIMD variant).
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
 #include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
 #include "v_logf_inline.h"

 const static struct data
@ -24,14 +24,15 @@ const static struct data

      P_10 and Q_10 are also stored in homogenous vectors to allow better
      memory access when no lanes are in a tail region.  */
-  float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
+  float Plo[4], PQ[4], Qhi[4];
+  float32x4_t P29_3, tailshift;
  float32x4_t P_50[6], Q_50[2];
  float32x4_t P_10[3], Q_10[3];
-  uint8x16_t idxhi, idxlo;
+  uint8_t idxhi[16], idxlo[16];
  struct v_logf_data logf_tbl;
 } data = {
-  .idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  .idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
+  .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 },
+  .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 },
  .P29_3 = V4 (0x1.b13626p-2),
  .tailshift = V4 (-0.87890625),
  .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
@ -86,7 +87,7 @@ lookup (float32x4_t tbl, uint8x16_t idx)
   tail region:
   _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
 				  want 0x1.b4793ap+0 .  */
-float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x)
 {
  const struct data *d = ptr_barrier (&data);

@ -124,18 +125,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
     Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
     two pairs of coeffs, so we need two idx vectors - one for each pair.  */
  uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
-  uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off);
-  uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off);
+  uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off);
+  uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off);

  /* Load the tables.  */
-  float32x4_t p_lo = d->Plo;
-  float32x4_t pq = d->PQ;
-  float32x4_t qhi = d->Qhi;
+  float32x4_t plo = vld1q_f32 (d->Plo);
+  float32x4_t pq = vld1q_f32 (d->PQ);
+  float32x4_t qhi = vld1q_f32 (d->Qhi);

  /* Do the lookup (and calculate p3 by masking non-tail lanes).  */
  float32x4_t p3 = vreinterpretq_f32_u32 (
      vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
-  float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi),
+  float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi),
 	      p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
 	      q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);

@ -155,9 +156,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
  return vdivq_f32 (p, q);
 }

-PL_SIG (V, F, 1, erfinv, -0.99, 0.99)
-PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49)
+HALF_WIDTH_ALIAS_F1 (erfinv)
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference
+#else
+TEST_SIG (V, F, 1, erfinv, -0.99, 0.99)
+TEST_DISABLE_FENV (V_NAME_F1 (erfinv))
+TEST_ULP (V_NAME_F1 (erfinv), 4.49)
+TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000)
 /* Test with control lane in each interval.  */
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95)
+TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95)
+#endif
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/v_logf_inline.h
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/v_logf_inline.h
@ -1,7 +1,7 @@
 /*
 * Single-precision vector log function - inline version
 *
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

--- a/contrib/arm-optimized-routines/math/aarch64/experimental/asin_3u.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/asin_3u.c
@ -1,22 +1,22 @@
 /*
 * Double-precision asin(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

 #include "poly_scalar_f64.h"
 #include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"

-#define AbsMask (0x7fffffffffffffff)
-#define Half (0x3fe0000000000000)
-#define One (0x3ff0000000000000)
-#define PiOver2 (0x1.921fb54442d18p+0)
-#define Small (0x3e50000000000000) /* 2^-26.  */
-#define Small16 (0x3e50)
-#define QNaN (0x7ff8)
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define One 0x3ff0000000000000
+#define PiOver2 0x1.921fb54442d18p+0
+#define Small 0x3e50000000000000 /* 2^-26.  */
+#define Small16 0x3e50
+#define QNaN 0x7ff8

 /* Fast implementation of double-precision asin(x) based on polynomial
   approximation.
@ -54,8 +54,8 @@
     asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).

   The largest observed error in this region is 2.69 ulps,
-   asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
-			     want 0x1.110d7e85fdd53p-1.  */
+   asin(0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+			     want 0x1.1111dd54ddf99p-1.  */
 double
 asin (double x)
 {
@ -96,11 +96,11 @@ asin (double x)
  return asdouble (asuint64 (y) | sign);
 }

-PL_SIG (S, D, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (asin, 2.19)
-PL_TEST_INTERVAL (asin, 0, Small, 5000)
-PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
-PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (asin, -0, -inf, 20000)
+TEST_SIG (S, D, 1, asin, -1.0, 1.0)
+TEST_ULP (asin, 2.20)
+TEST_INTERVAL (asin, 0, Small, 5000)
+TEST_INTERVAL (asin, Small, 0.5, 50000)
+TEST_INTERVAL (asin, 0.5, 1.0, 50000)
+TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asin, 0x1p11, inf, 20000)
+TEST_INTERVAL (asin, -0, -inf, 20000)
--- a/contrib/arm-optimized-routines/math/aarch64/experimental/asin_data.c
+++ b/contrib/arm-optimized-routines/math/aarch64/experimental/asin_data.c
@ -1,7 +1,7 @@
 /*
 * Coefficients for single-precision asin(x) function.
 *
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

--- a/Show more
+++ b/Show more