From d7024bf5034d474cfb5eaf446bbb1d42c266b674 Mon Sep 17 00:00:00 2001 From: Maximilian Bosch Date: Thu, 24 Jul 2025 16:17:55 +0200 Subject: [PATCH] glibc: 2.40-66 -> 2.40-142, fix CVE-2025-8058 See https://nvd.nist.gov/vuln/detail/CVE-2025-8058 Updating to the latest patch-level of the 2.40 branch to address this. (cherry picked from commit 7c104c9ba338167903547cff25e5eb5d2d0fcd04) --- .../libraries/glibc/2.40-master.patch | 17092 ++++++++++++++++ pkgs/development/libraries/glibc/common.nix | 2 +- 2 files changed, 17093 insertions(+), 1 deletion(-) diff --git a/pkgs/development/libraries/glibc/2.40-master.patch b/pkgs/development/libraries/glibc/2.40-master.patch index c7885ead7b7f..3384de16dc3b 100644 --- a/pkgs/development/libraries/glibc/2.40-master.patch +++ b/pkgs/development/libraries/glibc/2.40-master.patch @@ -26216,3 +26216,17095 @@ index f9e3425e04..089c47b04b 100644 struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + +commit aef8f8d6a947b290162393e1d717c7aee96fef8e +Author: H.J. Lu +Date: Tue Dec 17 18:41:45 2024 +0800 + + Hide all malloc functions from compiler [BZ #32366] + + Since -1 isn't a power of two, compiler may reject it, hide memalign from + Clang 19 which issues an error: + + tst-memalign.c:86:31: error: requested alignment is not a power of 2 [-Werror,-Wnon-power-of-two-alignment] + 86 | p = memalign (-1, pagesize); + | ^~ + tst-memalign.c:86:31: error: requested alignment must be 4294967296 bytes or smaller; maximum alignment assumed [-Werror,-Wbuiltin-assume-aligned-alignment] + 86 | p = memalign (-1, pagesize); + | ^~ + + Update tst-malloc-aux.h to hide all malloc functions and include it in + all malloc tests to prevent compiler from optimizing out any malloc + functions. + + Tested with Clang 19.1.5 and GCC 15 20241206 for BZ #32366. + + Signed-off-by: H.J. Lu + Reviewed-by: Sam James + (cherry picked from commit f9493a15ea9cfb63a815c00c23142369ec09d8ce) + +diff --git a/malloc/tst-mallinfo2.c b/malloc/tst-mallinfo2.c +index 2c02f5f700..f072b9f24b 100644 +--- a/malloc/tst-mallinfo2.c ++++ b/malloc/tst-mallinfo2.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + /* This is not specifically needed for the test, but (1) does + something to the data so gcc doesn't optimize it away, and (2) may + help when developing future tests. 
*/ +diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h +index 54908b4a24..3e1b61ce34 100644 +--- a/malloc/tst-malloc-aux.h ++++ b/malloc/tst-malloc-aux.h +@@ -22,20 +22,35 @@ + + #include + #include +- +-static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc; +-static void *(*volatile calloc_indirect)(size_t, size_t) = calloc; +-static void *(*volatile malloc_indirect)(size_t) = malloc; +-static void *(*volatile realloc_indirect)(void*, size_t) = realloc; ++#include ++ ++static __typeof (aligned_alloc) * volatile aligned_alloc_indirect ++ = aligned_alloc; ++static __typeof (calloc) * volatile calloc_indirect = calloc; ++static __typeof (malloc) * volatile malloc_indirect = malloc; ++static __typeof (memalign) * volatile memalign_indirect = memalign; ++static __typeof (posix_memalign) * volatile posix_memalign_indirect ++ = posix_memalign; ++static __typeof (pvalloc) * volatile pvalloc_indirect = pvalloc; ++static __typeof (realloc) * volatile realloc_indirect = realloc; ++static __typeof (valloc) * volatile valloc_indirect = valloc; + + #undef aligned_alloc + #undef calloc + #undef malloc ++#undef memalign ++#undef posix_memalign ++#undef pvalloc + #undef realloc ++#undef valloc + + #define aligned_alloc aligned_alloc_indirect + #define calloc calloc_indirect + #define malloc malloc_indirect ++#define memalign memalign_indirect ++#define posix_memalign posix_memalign_indirect ++#define pvalloc pvalloc_indirect + #define realloc realloc_indirect ++#define valloc valloc_indirect + + #endif /* TST_MALLOC_AUX_H */ +diff --git a/malloc/tst-malloc-backtrace.c b/malloc/tst-malloc-backtrace.c +index c7b1d65e5c..65fa91f6fd 100644 +--- a/malloc/tst-malloc-backtrace.c ++++ b/malloc/tst-malloc-backtrace.c +@@ -22,6 +22,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + #define SIZE 4096 + + /* Wrap free with a function to prevent gcc from optimizing it out. */ +diff --git a/malloc/tst-memalign.c b/malloc/tst-memalign.c +index 563f6413d2..ac9770d3f9 100644 +--- a/malloc/tst-memalign.c ++++ b/malloc/tst-memalign.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int errors = 0; + + static void +diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c +index 01dd07004d..63a7e2bc8e 100644 +--- a/malloc/tst-safe-linking.c ++++ b/malloc/tst-safe-linking.c +@@ -26,6 +26,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + /* Run CALLBACK and check that the data on standard error equals + EXPECTED. */ + static void +diff --git a/malloc/tst-valloc.c b/malloc/tst-valloc.c +index 9bab8c6470..0243d3dfd4 100644 +--- a/malloc/tst-valloc.c ++++ b/malloc/tst-valloc.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int errors = 0; + + static void + +commit be48b8f6ad0ec6d0d6b1d2f45eb59bf8e8c67dd7 +Author: Sam James +Date: Fri Jan 10 03:03:47 2025 +0000 + + malloc: obscure calloc use in tst-calloc + + Similar to a9944a52c967ce76a5894c30d0274b824df43c7a and + f9493a15ea9cfb63a815c00c23142369ec09d8ce, we need to hide calloc use from + the compiler to accommodate GCC's r15-6566-g804e9d55d9e54c change. + + First, include tst-malloc-aux.h, but then use `volatile` variables + for size. + + The test passes without the tst-malloc-aux.h change but IMO we want + it there for consistency and to avoid future problems (possibly silent). + + Reviewed-by: H.J. 
Lu + (cherry picked from commit c3d1dac96bdd10250aa37bb367d5ef8334a093a1) + +diff --git a/malloc/tst-calloc.c b/malloc/tst-calloc.c +index 01f17f9e65..5a8c7ab121 100644 +--- a/malloc/tst-calloc.c ++++ b/malloc/tst-calloc.c +@@ -23,6 +23,7 @@ + #include + #include + ++#include "tst-malloc-aux.h" + + /* Number of samples per size. */ + #define N 50000 +@@ -94,16 +95,19 @@ random_test (void) + static void + null_test (void) + { ++ /* Obscure allocation size from the compiler. */ ++ volatile size_t max_size = UINT_MAX; ++ volatile size_t zero_size = 0; + /* If the size is 0 the result is implementation defined. Just make + sure the program doesn't crash. The result of calloc is + deliberately ignored, so do not warn about that. */ + DIAG_PUSH_NEEDS_COMMENT; + DIAG_IGNORE_NEEDS_COMMENT (10, "-Wunused-result"); + calloc (0, 0); +- calloc (0, UINT_MAX); +- calloc (UINT_MAX, 0); +- calloc (0, ~((size_t) 0)); +- calloc (~((size_t) 0), 0); ++ calloc (0, max_size); ++ calloc (max_size, 0); ++ calloc (0, ~((size_t) zero_size)); ++ calloc (~((size_t) zero_size), 0); + DIAG_POP_NEEDS_COMMENT; + } + + +commit 85668221974db44459527e04d04f77ca8f8e3115 +Author: H.J. Lu +Date: Fri Jan 24 18:53:13 2025 +0800 + + stdlib: Test using setenv with updated environ [BZ #32588] + + Add a test for setenv with updated environ. Verify that BZ #32588 is + fixed. + + Signed-off-by: H.J. Lu + Reviewed-by: Florian Weimer + (cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da) + +diff --git a/stdlib/Makefile b/stdlib/Makefile +index 8213fa83ef..d3a84fa641 100644 +--- a/stdlib/Makefile ++++ b/stdlib/Makefile +@@ -307,6 +307,7 @@ tests := \ + tst-setcontext9 \ + tst-setcontext10 \ + tst-setcontext11 \ ++ tst-setenv-environ \ + tst-stdbit-Wconversion \ + tst-stdbit-builtins \ + tst-stdc_bit_ceil \ +diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c +new file mode 100644 +index 0000000000..02fcef96d0 +--- /dev/null ++++ b/stdlib/tst-setenv-environ.c +@@ -0,0 +1,36 @@ ++/* Test using setenv with updated environ. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++extern char **environ; ++ ++int ++do_test (void) ++{ ++ char *valp; ++ static char *dummy_environ[] = { NULL }; ++ environ = dummy_environ; ++ setenv ("A", "1", 0); ++ valp = getenv ("A"); ++ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0'); ++ return 0; ++} ++ ++#include + +commit e899ca3651f8c5e01bf3420cfb34aad97d093f74 +Author: John David Anglin +Date: Wed Jan 29 16:51:16 2025 -0500 + + nptl: Correct stack size attribute when stack grows up [BZ #32574] + + Set stack size attribute to the size of the mmap'd region only + when the size of the remaining stack space is less than the size + of the mmap'd region. + + This was reversed. As a result, the initial stack size was only + 135168 bytes. 
On architectures where the stack grows down, the + initial stack size is approximately 8384512 bytes with the default + rlimit settings. The small main stack size on hppa broke + applications like ruby that check for stack overflows. + + Signed-off-by: John David Anglin + +diff --git a/nptl/pthread_getattr_np.c b/nptl/pthread_getattr_np.c +index 1e91874767..3ce34437bc 100644 +--- a/nptl/pthread_getattr_np.c ++++ b/nptl/pthread_getattr_np.c +@@ -145,9 +145,9 @@ __pthread_getattr_np (pthread_t thread_id, pthread_attr_t *attr) + > (size_t) iattr->stackaddr - last_to) + iattr->stacksize = (size_t) iattr->stackaddr - last_to; + #else +- /* The limit might be too high. */ ++ /* The limit might be too low. */ + if ((size_t) iattr->stacksize +- > to - (size_t) iattr->stackaddr) ++ < to - (size_t) iattr->stackaddr) + iattr->stacksize = to - (size_t) iattr->stackaddr; + #endif + /* We succeed and no need to look further. */ + +commit d6c156c326999f144cb5b73d29982108d549ad8a +Author: Siddhesh Poyarekar +Date: Fri Jan 31 12:16:30 2025 -0500 + + assert: Add test for CVE-2025-0395 + + Use the __progname symbol to override the program name to induce the + failure that CVE-2025-0395 describes. + + This is related to BZ #32582 + + Signed-off-by: Siddhesh Poyarekar + Reviewed-by: Adhemerval Zanella + (cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2) + +diff --git a/assert/Makefile b/assert/Makefile +index 35dc908ddb..c0fe660bd6 100644 +--- a/assert/Makefile ++++ b/assert/Makefile +@@ -38,6 +38,7 @@ tests := \ + test-assert-perr \ + tst-assert-c++ \ + tst-assert-g++ \ ++ tst-assert-sa-2025-0001 \ + # tests + + ifeq ($(have-cxx-thread_local),yes) +diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c +new file mode 100644 +index 0000000000..102cb0078d +--- /dev/null ++++ b/assert/tst-assert-sa-2025-0001.c +@@ -0,0 +1,92 @@ ++/* Test for CVE-2025-0395. ++ Copyright The GNU Toolchain Authors. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Test that a large enough __progname does not result in a buffer overflow ++ when printing an assertion failure. This was CVE-2025-0395. */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++extern const char *__progname; ++ ++int ++do_test (int argc, char **argv) ++{ ++ ++ support_need_proc ("Reads /proc/self/maps to add guards to writable maps."); ++ ignore_stderr (); ++ ++ /* XXX assumes that the assert is on a 2 digit line number. */ ++ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n"; ++ ++ int ret = fprintf (stderr, prompt, __FILE__); ++ if (ret < 0) ++ FAIL_EXIT1 ("fprintf failed: %m\n"); ++ ++ size_t pagesize = getpagesize (); ++ size_t namesize = pagesize - 1 - ret; ++ ++ /* Alter the progname so that the assert message fills the entire page. 
*/ ++ char progname[namesize]; ++ memset (progname, 'A', namesize - 1); ++ progname[namesize - 1] = '\0'; ++ __progname = progname; ++ ++ FILE *f = xfopen ("/proc/self/maps", "r"); ++ char *line = NULL; ++ size_t len = 0; ++ uintptr_t prev_to = 0; ++ ++ /* Pad the beginning of every writable mapping with a PROT_NONE map. This ++ ensures that the mmap in the assert_fail path never ends up below a ++ writable map and will terminate immediately in case of a buffer ++ overflow. */ ++ while (xgetline (&line, &len, f)) ++ { ++ uintptr_t from, to; ++ char perm[4]; ++ ++ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ", ++ &from, &to, ++ &perm[0], &perm[1], &perm[2], &perm[3]); ++ ++ bool writable = (memchr (perm, 'w', 4) != NULL); ++ ++ if (prev_to != 0 && from - prev_to > pagesize && writable) ++ xmmap ((void *) from - pagesize, pagesize, PROT_NONE, ++ MAP_ANONYMOUS | MAP_PRIVATE, 0); ++ ++ prev_to = to; ++ } ++ ++ xfclose (f); ++ ++ assert (argc < 1); ++ return 0; ++} ++ ++#define EXPECTED_SIGNAL SIGABRT ++#define TEST_FUNCTION_ARGV do_test ++#include + +commit 523f85558152a1b9cced6d669f758c27677775ba +Author: John David Anglin +Date: Tue Feb 25 15:57:53 2025 -0500 + + math: Add optimization barrier to ensure a1 + u.d is not reused [BZ #30664] + + A number of fma tests started to fail on hppa when gcc was changed to + use Ranger rather than EVRP. Eventually I found that the value of + a1 + u.d in this is block of code was being computed in FE_TOWARDZERO + mode and not the original rounding mode: + + if (TININESS_AFTER_ROUNDING) + { + w.d = a1 + u.d; + if (w.ieee.exponent == 109) + return w.d * 0x1p-108; + } + + This caused the exponent value to be wrong and the wrong return path + to be used. + + Here we add an optimization barrier after the rounding mode is reset + to ensure that the previous value of a1 + u.d is not reused. + + Signed-off-by: John David Anglin + +diff --git a/sysdeps/ieee754/dbl-64/s_fma.c b/sysdeps/ieee754/dbl-64/s_fma.c +index c5f5abdc68..79a3cd721d 100644 +--- a/sysdeps/ieee754/dbl-64/s_fma.c ++++ b/sysdeps/ieee754/dbl-64/s_fma.c +@@ -244,6 +244,9 @@ __fma (double x, double y, double z) + /* Reset rounding mode and test for inexact simultaneously. */ + int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0; + ++ /* Ensure value of a1 + u.d is not reused. */ ++ a1 = math_opt_barrier (a1); ++ + if (__glibc_likely (adjust == 0)) + { + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + +commit ff10623706ea0096f3af7b38a3330ffb7fb15ae7 +Author: Joe Ramsay +Date: Mon Sep 9 13:00:01 2024 +0100 + + aarch64: Avoid redundant MOVs in AdvSIMD F32 logs + + Since the last operation is destructive, the first argument to the FMA + also has to be the first argument to the special-case in order to + avoid unnecessary MOVs. Reorder arguments and adjust special-case + bounds to facilitate this. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 8b09af572b208bfde4d31c6abbae047dcc217675) + +diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c +index 9347422a77..82228b599a 100644 +--- a/sysdeps/aarch64/fpu/log10f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10f_advsimd.c +@@ -22,11 +22,11 @@ + + static const struct data + { +- uint32x4_t min_norm; ++ uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; ++ uint32x4_t mantissa_mask; + float32x4_t poly[8]; + float32x4_t inv_ln10, ln2; +- uint32x4_t off, mantissa_mask; + } data = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). 
Max. relative error: 0x1.068ee468p-25. */ +@@ -35,18 +35,22 @@ static const struct data + V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), +- .min_norm = V4 (0x00800000), +- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), ++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), + }; + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, +- uint16x4_t cmp) ++special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, ++ uint16x4_t cmp, const struct data *d) + { + /* Fall back to scalar code. */ +- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); ++ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), ++ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); + } + + /* Fast implementation of AdvSIMD log10f, +@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- uint32x4_t u = vreinterpretq_u32_f32 (x); +- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), +- vget_low_u16 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = vsubq_u32 (u, d->off); ++ u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( +- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ +- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); ++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ ++ ++ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); ++ ++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log10(1+r) + n * log10(2). 
*/ +@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) + y = vmulq_f32 (y, d->inv_ln10); + + if (__glibc_unlikely (v_any_u16h (special))) +- return special_case (x, y, poly, r2, special); ++ return special_case (y, u_off, poly, r2, special, d); + return vfmaq_f32 (y, poly, r2); + } + libmvec_hidden_def (V_NAME_F1 (log10)) +diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c +index db21836749..84effe4fe9 100644 +--- a/sysdeps/aarch64/fpu/log2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log2f_advsimd.c +@@ -22,9 +22,9 @@ + + static const struct data + { +- uint32x4_t min_norm; ++ uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; +- uint32x4_t off, mantissa_mask; ++ uint32x4_t mantissa_mask; + float32x4_t poly[9]; + } data = { + /* Coefficients generated using Remez algorithm approximate +@@ -34,18 +34,22 @@ static const struct data + V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), + V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), + V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, +- .min_norm = V4 (0x00800000), +- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), ++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), + }; + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, +- uint16x4_t cmp) ++special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, ++ uint16x4_t cmp, const struct data *d) + { + /* Fall back to scalar code. */ +- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); ++ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), ++ vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); + } + + /* Fast implementation for single precision AdvSIMD log2, +@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- uint32x4_t u = vreinterpretq_u32_f32 (x); +- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), +- vget_low_u16 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = vsubq_u32 (u, d->off); ++ u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( +- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ +- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); ++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ ++ ++ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); ++ ++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log2(1+r) + n. 
*/ +@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) + float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); + + if (__glibc_unlikely (v_any_u16h (special))) +- return special_case (x, n, p, r, special); ++ return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); + } + libmvec_hidden_def (V_NAME_F1 (log2)) +diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c +index 3c0d0fcdc7..c20dbfd6c0 100644 +--- a/sysdeps/aarch64/fpu/logf_advsimd.c ++++ b/sysdeps/aarch64/fpu/logf_advsimd.c +@@ -21,20 +21,22 @@ + + static const struct data + { +- uint32x4_t min_norm; ++ uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; ++ uint32x4_t mantissa_mask; + float32x4_t poly[7]; +- float32x4_t ln2, tiny_bound; +- uint32x4_t off, mantissa_mask; ++ float32x4_t ln2; + } data = { + /* 3.34 ulp error. */ + .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), + V4 (-0x1.ffffc8p-2f) }, + .ln2 = V4 (0x1.62e43p-1f), +- .tiny_bound = V4 (0x1p-126), +- .min_norm = V4 (0x00800000), +- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), ++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) + }; +@@ -42,32 +44,37 @@ static const struct data + #define P(i) d->poly[7 - i] + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, +- uint16x4_t cmp) ++special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, ++ uint16x4_t cmp, const struct data *d) + { + /* Fall back to scalar code. */ +- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); ++ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), ++ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); + } + + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); + float32x4_t n, p, q, r, r2, y; +- uint32x4_t u; ++ uint32x4_t u, u_off; + uint16x4_t cmp; + +- u = vreinterpretq_u32_f32 (x); +- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), +- vget_low_u16 (d->special_bound)); ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = vsubq_u32 (u, d->off); ++ u_off = vsubq_u32 (u_off, d->off); + n = vcvtq_f32_s32 ( +- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ +- u = vandq_u32 (u, d->mantissa_mask); ++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ ++ u = vandq_u32 (u_off, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + ++ cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); ++ + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
*/ +@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) + p = vfmaq_f32 (r, d->ln2, n); + + if (__glibc_unlikely (v_any_u16h (cmp))) +- return special_case (x, y, r2, p, cmp); ++ return special_case (p, u_off, y, r2, cmp, d); + return vfmaq_f32 (p, y, r2); + } + libmvec_hidden_def (V_NAME_F1 (log)) + +commit a991a0fc7c051d7ef2ea7778e0a699f22d4e53d7 +Author: Joe Ramsay +Date: Thu Sep 19 17:34:02 2024 +0100 + + AArch64: Add vector logp1 alias for log1p + + This enables vectorisation of C23 logp1, which is an alias for log1p. + There are no new tests or ulp entries because the new symbols are simply + aliases. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 751a5502bea1d13551c62c47bb9bd25bff870cda) + +diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h +index 08a41c46ad..5019e8e25c 100644 +--- a/bits/libm-simd-decl-stubs.h ++++ b/bits/libm-simd-decl-stubs.h +@@ -253,6 +253,17 @@ + #define __DECL_SIMD_log1pf64x + #define __DECL_SIMD_log1pf128x + ++#define __DECL_SIMD_logp1 ++#define __DECL_SIMD_logp1f ++#define __DECL_SIMD_logp1l ++#define __DECL_SIMD_logp1f16 ++#define __DECL_SIMD_logp1f32 ++#define __DECL_SIMD_logp1f64 ++#define __DECL_SIMD_logp1f128 ++#define __DECL_SIMD_logp1f32x ++#define __DECL_SIMD_logp1f64x ++#define __DECL_SIMD_logp1f128x ++ + #define __DECL_SIMD_atanh + #define __DECL_SIMD_atanhf + #define __DECL_SIMD_atanhl +diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h +index 6cb594b6ff..92856becc4 100644 +--- a/math/bits/mathcalls.h ++++ b/math/bits/mathcalls.h +@@ -126,7 +126,7 @@ __MATHCALL (log2p1,, (_Mdouble_ __x)); + __MATHCALL (log10p1,, (_Mdouble_ __x)); + + /* Return log(1 + X). */ +-__MATHCALL (logp1,, (_Mdouble_ __x)); ++__MATHCALL_VEC (logp1,, (_Mdouble_ __x)); + #endif + + #if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99 +diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions +index cc15ce2d1e..015211f5f4 100644 +--- a/sysdeps/aarch64/fpu/Versions ++++ b/sysdeps/aarch64/fpu/Versions +@@ -135,4 +135,11 @@ libmvec { + _ZGVsMxv_tanh; + _ZGVsMxv_tanhf; + } ++ GLIBC_2.41 { ++ _ZGVnN2v_logp1; ++ _ZGVnN2v_logp1f; ++ _ZGVnN4v_logp1f; ++ _ZGVsMxv_logp1; ++ _ZGVsMxv_logp1f; ++ } + } +diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h +index 097d403ffe..5909bb4ce9 100644 +--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h ++++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h +@@ -36,6 +36,7 @@ libmvec_hidden_proto (V_NAME_F2(hypot)); + libmvec_hidden_proto (V_NAME_F1(log10)); + libmvec_hidden_proto (V_NAME_F1(log1p)); + libmvec_hidden_proto (V_NAME_F1(log2)); ++libmvec_hidden_proto (V_NAME_F1(logp1)); + libmvec_hidden_proto (V_NAME_F1(log)); + libmvec_hidden_proto (V_NAME_F2(pow)); + libmvec_hidden_proto (V_NAME_F1(sin)); +diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h +index 7484150131..f295fe185d 100644 +--- a/sysdeps/aarch64/fpu/bits/math-vector.h ++++ b/sysdeps/aarch64/fpu/bits/math-vector.h +@@ -113,6 +113,10 @@ + # define __DECL_SIMD_log2 __DECL_SIMD_aarch64 + # undef __DECL_SIMD_log2f + # define __DECL_SIMD_log2f __DECL_SIMD_aarch64 ++# undef __DECL_SIMD_logp1 ++# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64 ++# undef __DECL_SIMD_logp1f ++# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64 + # undef __DECL_SIMD_pow + # define __DECL_SIMD_pow __DECL_SIMD_aarch64 + # undef __DECL_SIMD_powf +@@ -180,6 +184,7 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); 
+ __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); ++__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); +@@ -207,6 +212,7 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); ++__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +@@ -239,6 +245,7 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t); ++__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t); +@@ -266,6 +273,7 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t); ++__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t); +diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c +index ffc418fc9c..114064c696 100644 +--- a/sysdeps/aarch64/fpu/log1p_advsimd.c ++++ b/sysdeps/aarch64/fpu/log1p_advsimd.c +@@ -127,3 +127,5 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) + + return vfmaq_f64 (y, f2, p); + } ++ ++strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c +index 04f7e5720e..b21cfb2c90 100644 +--- a/sysdeps/aarch64/fpu/log1p_sve.c ++++ b/sysdeps/aarch64/fpu/log1p_sve.c +@@ -116,3 +116,5 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) + + return y; + } ++ ++strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c +index dc15334a85..8cfa28fb8a 100644 +--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c ++++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c +@@ -128,3 +128,6 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) + } + libmvec_hidden_def (V_NAME_F1 (log1p)) + HALF_WIDTH_ALIAS_F1 (log1p) ++strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) ++libmvec_hidden_def (V_NAME_F1 (logp1)) ++HALF_WIDTH_ALIAS_F1 (logp1) +diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c +index f645cc997e..5256d5e94c 100644 +--- a/sysdeps/aarch64/fpu/log1pf_sve.c ++++ b/sysdeps/aarch64/fpu/log1pf_sve.c +@@ -98,3 +98,5 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) + + return y; + } ++ ++strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) +diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist +index b685106954..98687cae0d 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist ++++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist +@@ -128,3 +128,8 @@ GLIBC_2.40 _ZGVsMxvv_hypot F + GLIBC_2.40 _ZGVsMxvv_hypotf F + 
GLIBC_2.40 _ZGVsMxvv_pow F + GLIBC_2.40 _ZGVsMxvv_powf F ++GLIBC_2.41 _ZGVnN2v_logp1 F ++GLIBC_2.41 _ZGVnN2v_logp1f F ++GLIBC_2.41 _ZGVnN4v_logp1f F ++GLIBC_2.41 _ZGVsMxv_logp1 F ++GLIBC_2.41 _ZGVsMxv_logp1f F + +commit 354aeaf2130c1484007025563fe87c997f07324a +Author: Joe Ramsay +Date: Mon Sep 23 15:26:12 2024 +0100 + + AArch64: Improve codegen in SVE expf & related routines + + Reduce MOV and MOVPRFX by improving special-case handling. Use inline + helper to duplicate the entire computation between the special- and + non-special case branches, removing the contention for z0 between x + and the return value. + + Also rearrange some MLAs and MLSs - by making the multiplicand the + destination we can avoid a MOVPRFX in several cases. Also change which + constants go in the vector used for lanewise ops - the last lane is no + longer wasted. + + Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the + comment that explains it. Fixed - worst-case ULP for exp2f moves + around but it doesn't change significantly for either routine. + + Worst-case error for coshf increases due to passing x to exp rather + than abs(x) - updated the comment, but does not require regen-ulps. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 7b8c134b5460ed933d610fa92ed1227372b68fdc) + +diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c +index e5d8a299c6..7ad6efa0fc 100644 +--- a/sysdeps/aarch64/fpu/coshf_sve.c ++++ b/sysdeps/aarch64/fpu/coshf_sve.c +@@ -23,37 +23,42 @@ + static const struct data + { + struct sv_expf_data expf_consts; +- uint32_t special_bound; ++ float special_bound; + } data = { + .expf_consts = SV_EXPF_DATA, + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ +- .special_bound = 0x42ad496c, ++ .special_bound = 0x1.5a92d8p+6, + }; + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) ++special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, ++ svbool_t pg) + { +- return sv_call_f32 (coshf, x, y, pg); ++ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e), ++ pg); + } + + /* Single-precision vector cosh, using vector expf. +- Maximum error is 1.89 ULP: +- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 +- want 0x1.f00adcp+127. */ ++ Maximum error is 2.77 ULP: ++ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 ++ want 0x1.e4594cp+2. */ + svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- svfloat32_t ax = svabs_x (pg, x); +- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); ++ svbool_t special = svacge (pg, x, d->special_bound); + +- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ +- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); +- svfloat32_t half_t = svmul_x (pg, t, 0.5); +- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); ++ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. ++ Note that x is passed to exp here, rather than |x|. This is to avoid using ++ destructive unary ABS for better register usage. However it means the ++ routine is not exactly symmetrical, as the exp helper is slightly less ++ accurate in the negative range. 
*/ ++ svfloat32_t e = expf_inline (x, pg, &d->expf_consts); ++ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5); ++ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svadd_x (pg, half_t, half_over_t), special); ++ return special_case (x, half_e, half_over_e, special); + +- return svadd_x (pg, half_t, half_over_t); ++ return svadd_x (svptrue_b32 (), half_e, half_over_e); + } +diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c +index e09b2f3b27..8aa3fa9c43 100644 +--- a/sysdeps/aarch64/fpu/exp10f_sve.c ++++ b/sysdeps/aarch64/fpu/exp10f_sve.c +@@ -18,74 +18,83 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + +-/* For x < -SpecialBound, the result is subnormal and not handled correctly by ++/* For x < -Thres, the result is subnormal and not handled correctly by + FEXPA. */ +-#define SpecialBound 37.9 ++#define Thres 37.9 + + static const struct data + { +- float poly[5]; +- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound; ++ float log2_10_lo, c0, c2, c4; ++ float c1, c3, log10_2; ++ float shift, log2_10_hi, thres; + } data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. + rel error: 0x1.89dafa3p-24 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] + maxerr: 0.52 +0.5 ulp. */ +- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f, +- 0x1.12b41ap-1f }, ++ .c0 = 0x1.26bb16p+1f, ++ .c1 = 0x1.5350d2p+1f, ++ .c2 = 0x1.04744ap+1f, ++ .c3 = 0x1.2d8176p+0f, ++ .c4 = 0x1.12b41ap-1f, + /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ +- .shift = 0x1.903f8p17f, ++ .shift = 0x1.803f8p17f, + .log10_2 = 0x1.a934fp+1, + .log2_10_hi = 0x1.344136p-2, + .log2_10_lo = -0x1.ec10cp-27, +- .special_bound = SpecialBound, ++ .thres = Thres, + }; + +-static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++static inline svfloat32_t ++sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) + { +- return sv_call_f32 (exp10f, x, y, special); +-} +- +-/* Single-precision SVE exp10f routine. Implements the same algorithm +- as AdvSIMD exp10f. +- Worst case error is 1.02 ULPs. +- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 +- want 0x1.ba5f9cp-1. */ +-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) +-{ +- const struct data *d = ptr_barrier (&data); + /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ + +- /* Load some constants in quad-word chunks to minimise memory access (last +- lane is wasted). */ +- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2); ++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); + + /* n = round(x/(log10(2)/N)). */ + svfloat32_t shift = sv_f32 (d->shift); +- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0); +- svfloat32_t n = svsub_x (pg, z, shift); ++ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); ++ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); + + /* r = x - n*log10(2)/N. 
*/ +- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1); +- r = svmls_lane (r, n, log10_2_and_inv, 2); ++ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); ++ r = svmls_lane (r, n, lane_consts, 0); + +- svbool_t special = svacgt (pg, x, d->special_bound); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ +- svfloat32_t r2 = svmul_x (pg, r, r); +- svfloat32_t poly +- = svmla_x (pg, svmul_x (pg, r, d->poly[0]), +- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2); +- +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (pg, scale, scale, poly), special); ++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); ++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); ++ svfloat32_t p14 = svmla_x (pg, p12, p34, r2); ++ svfloat32_t p0 = svmul_lane (r, lane_consts, 1); ++ svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); + } ++ ++static svfloat32_t NOINLINE ++special_case (svfloat32_t x, svbool_t special, const struct data *d) ++{ ++ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d), ++ special); ++} ++ ++/* Single-precision SVE exp10f routine. Implements the same algorithm ++ as AdvSIMD exp10f. ++ Worst case error is 1.02 ULPs. ++ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 ++ want 0x1.ba5f9cp-1. */ ++svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) ++{ ++ const struct data *d = ptr_barrier (&data); ++ svbool_t special = svacgt (pg, x, d->thres); ++ if (__glibc_unlikely (svptest_any (special, special))) ++ return special_case (x, special, d); ++ return sv_exp10f_inline (x, pg, d); ++} +diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c +index 8a686e3e05..c6216bed9e 100644 +--- a/sysdeps/aarch64/fpu/exp2f_sve.c ++++ b/sysdeps/aarch64/fpu/exp2f_sve.c +@@ -24,54 +24,64 @@ + + static const struct data + { +- float poly[5]; ++ float c0, c2, c4, c1, c3; + float shift, thres; + } data = { +- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for +- compatibility with polynomial helpers. */ +- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f, +- 0x1.59977ap-10f }, ++ /* Coefficients copied from the polynomial in AdvSIMD variant. */ ++ .c0 = 0x1.62e422p-1f, ++ .c1 = 0x1.ebf9bcp-3f, ++ .c2 = 0x1.c6bd32p-5f, ++ .c3 = 0x1.3ce9e4p-7f, ++ .c4 = 0x1.59977ap-10f, + /* 1.5*2^17 + 127. */ +- .shift = 0x1.903f8p17f, ++ .shift = 0x1.803f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. */ + .thres = Thres, + }; + +-static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +-{ +- return sv_call_f32 (exp2f, x, y, special); +-} +- +-/* Single-precision SVE exp2f routine. Implements the same algorithm +- as AdvSIMD exp2f. +- Worst case error is 1.04 ULPs. +- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0 +- want 0x1.ba7ebp+0. */ +-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) ++static inline svfloat32_t ++sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) + { +- const struct data *d = ptr_barrier (&data); + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ +- svfloat32_t shift = sv_f32 (d->shift); +- svfloat32_t z = svadd_x (pg, x, shift); +- svfloat32_t n = svsub_x (pg, z, shift); +- svfloat32_t r = svsub_x (pg, x, n); ++ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift); ++ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift); ++ svfloat32_t r = svsub_x (svptrue_b32 (), x, n); + +- svbool_t special = svacgt (pg, x, d->thres); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp2(r)-1. + Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for + coefficients 1 to 4, and apply most significant coefficient directly. */ +- svfloat32_t r2 = svmul_x (pg, r, r); +- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1); +- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]); ++ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); ++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); ++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); ++ svfloat32_t p14 = svmla_x (pg, p12, r2, p34); ++ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (pg, scale, scale, poly), special); +- + return svmla_x (pg, scale, scale, poly); + } ++ ++static svfloat32_t NOINLINE ++special_case (svfloat32_t x, svbool_t special, const struct data *d) ++{ ++ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d), ++ special); ++} ++ ++/* Single-precision SVE exp2f routine. Implements the same algorithm ++ as AdvSIMD exp2f. ++ Worst case error is 1.04 ULPs. ++ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 ++ want 0x1.ba6a64p-1. */ ++svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) ++{ ++ const struct data *d = ptr_barrier (&data); ++ svbool_t special = svacgt (pg, x, d->thres); ++ if (__glibc_unlikely (svptest_any (special, special))) ++ return special_case (x, special, d); ++ return sv_exp2f_inline (x, pg, d); ++} +diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c +index 3ba79bc4f1..da93e01b87 100644 +--- a/sysdeps/aarch64/fpu/expf_sve.c ++++ b/sysdeps/aarch64/fpu/expf_sve.c +@@ -18,33 +18,25 @@ + . */ + + #include "sv_math.h" ++#include "sv_expf_inline.h" ++ ++/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled ++ correctly by FEXPA. */ ++#define Thres 0x1.5d5e2ap+6f + + static const struct data + { +- float poly[5]; +- float inv_ln2, ln2_hi, ln2_lo, shift, thres; ++ struct sv_expf_data d; ++ float thres; + } data = { +- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for +- compatibility with polynomial helpers. */ +- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, +- 0x1.0e4020p-7f }, +- .inv_ln2 = 0x1.715476p+0f, +- .ln2_hi = 0x1.62e4p-1f, +- .ln2_lo = 0x1.7f7d1cp-20f, +- /* 1.5*2^17 + 127. */ +- .shift = 0x1.903f8p17f, +- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled +- correctly by FEXPA. 
*/ +- .thres = 0x1.5d5e2ap+6f, ++ .d = SV_EXPF_DATA, ++ .thres = Thres, + }; + +-#define C(i) sv_f32 (d->poly[i]) +-#define ExponentBias 0x3f800000 +- + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) + { +- return sv_call_f32 (expf, x, y, special); ++ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special); + } + + /* Optimised single-precision SVE exp function. +@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) + svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +- +- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] +- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +- +- /* Load some constants in quad-word chunks to minimise memory access (last +- lane is wasted). */ +- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2); +- +- /* n = round(x/(ln2/N)). */ +- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0); +- svfloat32_t n = svsub_x (pg, z, d->shift); +- +- /* r = x - n*ln2/N. */ +- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1); +- r = svmls_lane (r, n, invln2_and_ln2, 2); +- +- /* scale = 2^(n/N). */ + svbool_t is_special_case = svacgt (pg, x, d->thres); +- svfloat32_t scale = svexpa (svreinterpret_u32 (z)); +- +- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ +- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); +- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r); +- svfloat32_t r2 = svmul_x (pg, r, r); +- svfloat32_t p14 = svmla_x (pg, p12, p34, r2); +- svfloat32_t p0 = svmul_x (pg, r, C (0)); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); +- + if (__glibc_unlikely (svptest_any (pg, is_special_case))) +- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case); +- +- return svmla_x (pg, scale, scale, poly); ++ return special_case (x, is_special_case, &d->d); ++ return expf_inline (x, pg, &d->d); + } +diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h +index 23963b5f8e..6166df6553 100644 +--- a/sysdeps/aarch64/fpu/sv_expf_inline.h ++++ b/sysdeps/aarch64/fpu/sv_expf_inline.h +@@ -24,19 +24,20 @@ + + struct sv_expf_data + { +- float poly[5]; +- float inv_ln2, ln2_hi, ln2_lo, shift; ++ float c1, c3, inv_ln2; ++ float ln2_lo, c0, c2, c4; ++ float ln2_hi, shift; + }; + + /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ + #define SV_EXPF_DATA \ + { \ +- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ +- 0x1.0e4020p-7f }, \ +- \ +- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ +- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ ++ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \ ++ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ ++ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ ++ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ ++ .shift = 0x1.803f8p17f, \ + } + + #define C(i) sv_f32 (d->poly[i]) +@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + +- /* Load some constants in quad-word chunks to minimise memory access. 
*/ +- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); ++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); + + /* n = round(x/(ln2/N)). */ +- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); ++ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ +- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); +- r = svmls_lane (r, n, c4_invln2_and_ln2, 3); ++ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); ++ r = svmls_lane (r, n, lane_consts, 0); + + /* scale = 2^(n/N). */ +- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); ++ svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ +- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); +- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); +- svfloat32_t r2 = svmul_f32_x (pg, r, r); ++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); ++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); +- svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); ++ svfloat32_t p0 = svmul_lane (r, lane_consts, 1); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); + +commit c4373426e3a85ec483a0f412c2a7c6cdfa32ccdb +Author: Joe Ramsay +Date: Mon Sep 23 15:30:20 2024 +0100 + + AArch64: Improve codegen in SVE F32 logs + + Reduce MOVPRFXs by using unpredicated (non-destructive) instructions + where possible. Similar to the recent change to AdvSIMD F32 logs, + adjust special-case arguments and bounds to allow for more optimal + register usage. For all 3 routines one MOVPRFX remains in the + reduction, which cannot be avoided as immediate AND and ASR are both + destructive. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d) + +diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c +index bdbb49cd32..7913679f67 100644 +--- a/sysdeps/aarch64/fpu/log10f_sve.c ++++ b/sysdeps/aarch64/fpu/log10f_sve.c +@@ -24,6 +24,7 @@ static const struct data + float poly_0246[4]; + float poly_1357[4]; + float ln2, inv_ln10; ++ uint32_t off, lower; + } data = { + .poly_1357 = { + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs +@@ -35,18 +36,23 @@ static const struct data + -0x1.0fc92cp-4f }, + .ln2 = 0x1.62e43p-1f, + .inv_ln10 = 0x1.bcb7b2p-2f, ++ .off = 0x3f2aaaab, ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .lower = 0x00800000 - 0x3f2aaaab + }; + +-#define Min 0x00800000 +-#define Max 0x7f800000 +-#define Thres 0x7f000000 /* Max - Min. */ +-#define Offset 0x3f2aaaab /* 0.666667. */ ++#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. 
*/ + #define MantissaMask 0x007fffff + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, ++ svbool_t cmp) + { +- return sv_call_f32 (log10f, x, y, special); ++ return sv_call_f32 ( ++ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), ++ svmla_x (svptrue_b32 (), p, r2, y), cmp); + } + + /* Optimised implementation of SVE log10f using the same algorithm and +@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) + svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +- svuint32_t ix = svreinterpret_u32 (x); +- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); ++ ++ svuint32_t u_off = svreinterpret_u32 (x); ++ ++ u_off = svsub_x (pg, u_off, d->off); ++ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- ix = svsub_x (pg, ix, Offset); + svfloat32_t n = svcvt_f32_x ( +- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */ +- ix = svand_x (pg, ix, MantissaMask); +- ix = svadd_x (pg, ix, Offset); ++ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */ ++ svuint32_t ix = svand_x (pg, u_off, MantissaMask); ++ ix = svadd_x (pg, ix, d->off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); + + /* y = log10(1+r) + n*log10(2) + log10(1+r) ~ r * InvLn(10) + P(r) + where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */ +- svfloat32_t r2 = svmul_x (pg, r, r); +- svfloat32_t r4 = svmul_x (pg, r2, r2); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); ++ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2); + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); +@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) + hi = svmul_x (pg, hi, d->inv_ln10); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), +- special); +- return svmla_x (pg, hi, r2, y); ++ return special_case (u_off, hi, r2, y, special); ++ return svmla_x (svptrue_b32 (), hi, r2, y); + } +diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c +index 5031c42483..939d89bfb9 100644 +--- a/sysdeps/aarch64/fpu/log2f_sve.c ++++ b/sysdeps/aarch64/fpu/log2f_sve.c +@@ -23,6 +23,7 @@ static const struct data + { + float poly_02468[5]; + float poly_1357[4]; ++ uint32_t off, lower; + } data = { + .poly_1357 = { + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs +@@ -32,18 +33,23 @@ static const struct data + }, + .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f, + 0x1.9d8ecap-3f, 0x1.9e495p-3f }, ++ .off = 0x3f2aaaab, ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .lower = 0x00800000 - 0x3f2aaaab + }; + +-#define Min (0x00800000) +-#define Max (0x7f800000) +-#define Thres (0x7f000000) /* Max - Min. */ ++#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. 
*/ + #define MantissaMask (0x007fffff) +-#define Off (0x3f2aaaab) /* 0.666667. */ + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) ++special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, ++ svbool_t cmp) + { +- return sv_call_f32 (log2f, x, y, cmp); ++ return sv_call_f32 ( ++ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), ++ svmla_x (svptrue_b32 (), p, r2, y), cmp); + } + + /* Optimised implementation of SVE log2f, using the same algorithm +@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- svuint32_t u = svreinterpret_u32 (x); +- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres); ++ svuint32_t u_off = svreinterpret_u32 (x); ++ ++ u_off = svsub_x (pg, u_off, d->off); ++ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = svsub_x (pg, u, Off); + svfloat32_t n = svcvt_f32_x ( +- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ +- u = svand_x (pg, u, MantissaMask); +- u = svadd_x (pg, u, Off); ++ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ ++ svuint32_t u = svand_x (pg, u_off, MantissaMask); ++ u = svadd_x (pg, u, d->off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); + + /* y = log2(1+r) + n. */ +- svfloat32_t r2 = svmul_x (pg, r, r); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + + /* Evaluate polynomial using pairwise Horner scheme. */ + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); +@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) + y = svmla_x (pg, q_01, r2, y); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special); +- return svmla_x (pg, n, r, y); ++ return special_case (u_off, n, r, y, special); ++ return svmla_x (svptrue_b32 (), n, r, y); + } +diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c +index d64e810cfe..5b9324678d 100644 +--- a/sysdeps/aarch64/fpu/logf_sve.c ++++ b/sysdeps/aarch64/fpu/logf_sve.c +@@ -24,6 +24,7 @@ static const struct data + float poly_0135[4]; + float poly_246[3]; + float ln2; ++ uint32_t off, lower; + } data = { + .poly_0135 = { + /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so +@@ -32,19 +33,24 @@ static const struct data + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f + }, + .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f }, +- .ln2 = 0x1.62e43p-1f ++ .ln2 = 0x1.62e43p-1f, ++ .off = 0x3f2aaaab, ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .lower = 0x00800000 - 0x3f2aaaab + }; + +-#define Min (0x00800000) +-#define Max (0x7f800000) +-#define Thresh (0x7f000000) /* Max - Min. */ ++#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */ + #define Mask (0x007fffff) +-#define Off (0x3f2aaaab) /* 0.666667. 
*/ + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) ++special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, ++ svbool_t cmp) + { +- return sv_call_f32 (logf, x, y, cmp); ++ return sv_call_f32 ( ++ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), ++ svmla_x (svptrue_b32 (), p, r2, y), cmp); + } + + /* Optimised implementation of SVE logf, using the same algorithm and +@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- svuint32_t u = svreinterpret_u32 (x); +- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh); ++ svuint32_t u_off = svreinterpret_u32 (x); ++ ++ u_off = svsub_x (pg, u_off, d->off); ++ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = svsub_x (pg, u, Off); + svfloat32_t n = svcvt_f32_x ( +- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ +- u = svand_x (pg, u, Mask); +- u = svadd_x (pg, u, Off); ++ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ ++ ++ svuint32_t u = svand_x (pg, u_off, Mask); ++ u = svadd_x (pg, u, d->off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); + + /* y = log(1+r) + n*ln2. */ +- svfloat32_t r2 = svmul_x (pg, r, r); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */ + svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); + svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); +@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) + p = svmla_x (pg, r, n, d->ln2); + + if (__glibc_unlikely (svptest_any (pg, cmp))) +- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp); ++ return special_case (u_off, p, r2, y, cmp); + return svmla_x (pg, p, r2, y); + } + +commit 520240173029fd03388ec01db9a5359291cbbd27 +Author: Joe Ramsay +Date: Mon Sep 23 15:32:14 2024 +0100 + + AArch64: Improve codegen in users of AdvSIMD log1pf helper + + log1pf is quite register-intensive - use fewer registers for the + polynomial, and make various changes to shorten dependency chains in + parent routines. There is now no spilling with GCC 14. Accuracy moves + around a little - comments adjusted accordingly but does not require + regen-ulps. + + Use the helper in log1pf as well, instead of having separate + implementations. The more accurate polynomial means special-casing can + be simplified, and the shorter dependency chain avoids the usual dance + around v0, which is otherwise difficult. + + There is a small duplication of vectors containing 1.0f (or 0x3f800000) - + GCC is not currently able to efficiently handle values which fit in FMOV + but not MOVI, and are reinterpreted to integer. There may be potential + for more optimisation if this is fixed. 
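/* A scalar sketch of the reduction used by the SVE logf/log2f/log10f
   routines above: the offset asuint32(2/3) is subtracted once, and both the
   special-case test (with its wrapped-around lower bound) and the
   exponent/mantissa split are derived from that biased value.  The function
   and helper names below are illustrative only, and libm's log1pf stands in
   for the polynomial the vector code evaluates.  */
#include <math.h>
#include <stdint.h>
#include <string.h>

static uint32_t asu32 (float f) { uint32_t u; memcpy (&u, &f, sizeof u); return u; }
static float asf32 (uint32_t u) { float f; memcpy (&f, &u, sizeof f); return f; }

float
logf_reduction_sketch (float x)
{
  const uint32_t off = 0x3f2aaaab;           /* asuint32 (0.666667).  */
  const uint32_t lower = 0x00800000 - off;   /* smallest normal minus offset (wraps).  */
  const uint32_t thresh = 0x7f000000;        /* asuint32 (inf) - 0x00800000.  */

  uint32_t u_off = asu32 (x) - off;
  if (u_off - lower >= thresh)               /* zero, subnormal, negative, inf, nan.  */
    return logf (x);                         /* scalar fallback, like sv_call_f32.  */

  /* x = 2^n * (1 + r), with 2/3 < 1 + r < 4/3.  */
  float n = (float) ((int32_t) u_off >> 23); /* sign-extending shift.  */
  float r = asf32 ((u_off & 0x007fffff) + off) - 1.0f;

  /* y = log(1 + r) + n * ln2; the polynomial is replaced by log1pf here.  */
  return fmaf (n, 0x1.62e43p-1f, log1pf (r));
}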
+ + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 5bc100bd4b7e00db3009ae93d25d303341545d23) + +diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c +index 8916dcbf40..004474acf9 100644 +--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c ++++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c +@@ -25,35 +25,32 @@ const static struct data + { + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +- uint16x4_t thresh; +-} data = { +- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, +- .one = V4 (0x3f800000), +- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ +-}; ++} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) }; ++ ++#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ + + static float32x4_t NOINLINE VPCS_ATTR + special_case (float32x4_t x, float32x4_t y, uint16x4_t special, +- const struct v_log1pf_data d) ++ const struct v_log1pf_data *d) + { + return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); + } + + /* Vector approximation for single-precision acosh, based on log1p. Maximum + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it +- is 2.78 ULP: +- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 +- want 0x1.ef9ea2p-3. ++ is 3.00 ULP: ++ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 ++ want 0x1.ef0a7cp-4. + With exceptions disabled, we can compute u with a shorter dependency chain, +- which gives maximum error of 3.07 ULP: +- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 +- want 0x1.fbc7f4p-4. */ ++ which gives maximum error of 3.22 ULP: ++ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 ++ want 0x1.fdcdd2p-5. */ + + VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); +- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh); ++ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); + + #if WANT_SIMD_EXCEPT + /* Mask special lanes with 1 to side-step spurious invalid or overflow. 
Use +@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) + float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); + float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); + #else +- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1)); +- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f))); ++ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); ++ float32x4_t u ++ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); + #endif + + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); + + if (__glibc_unlikely (v_any_u16h (special))) +- return special_case (x, y, special, d->log1pf_consts); +- return log1pf_inline (y, d->log1pf_consts); ++ return special_case (x, y, special, &d->log1pf_consts); ++ return log1pf_inline (y, &d->log1pf_consts); + } + libmvec_hidden_def (V_NAME_F1 (acosh)) + HALF_WIDTH_ALIAS_F1 (acosh) +diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c +index 09fd8a6143..eb789b91b6 100644 +--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c ++++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c +@@ -20,16 +20,16 @@ + #include "v_math.h" + #include "v_log1pf_inline.h" + +-#define SignMask v_u32 (0x80000000) +- + const static struct data + { + struct v_log1pf_data log1pf_consts; ++ float32x4_t one; + uint32x4_t big_bound; + #if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; + #endif + } data = { ++ .one = V4 (1), + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ + #if WANT_SIMD_EXCEPT +@@ -38,20 +38,27 @@ const static struct data + }; + + static float32x4_t NOINLINE VPCS_ATTR +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, uint32x4_t sign, float32x4_t y, ++ uint32x4_t special, const struct data *d) + { +- return v_call_f32 (asinhf, x, y, special); ++ return v_call_f32 ( ++ asinhf, x, ++ vreinterpretq_f32_u32 (veorq_u32 ( ++ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))), ++ special); + } + + /* Single-precision implementation of vector asinh(x), using vector log1p. +- Worst-case error is 2.66 ULP, at roughly +/-0.25: +- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ ++ Worst-case error is 2.59 ULP: ++ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3 ++ want 0x1.d449c4p-3. */ + VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) + { + const struct data *dat = ptr_barrier (&data); +- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask); +- float32x4_t ax = vreinterpretq_f32_u32 (iax); ++ float32x4_t ax = vabsq_f32 (x); ++ uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); ++ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); + float32x4_t special_arg = x; + + #if WANT_SIMD_EXCEPT +@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). 
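/* Scalar sketches of the log1p identities the AdvSIMD acoshf/asinhf code
   above relies on; the *_identity_sketch names are illustrative, and libm's
   log1pf stands in for log1pf_inline.  */
#include <math.h>

/* acoshf: with xm1 = x - 1 and u = xm1 * (x + 1) = x^2 - 1,
   acosh(x) = log1p(xm1 + sqrt(u)) for x >= 1.  */
float
acoshf_identity_sketch (float x)
{
  float xm1 = x - 1.0f;
  float u = xm1 * (x + 1.0f);
  return log1pf (xm1 + sqrtf (u));
}

/* asinhf, positive argument: asinh(x) = log1p(x + x*x / (1 + sqrt(x*x + 1)));
   the vector code re-applies the sign of x at the end.  */
float
asinhf_identity_sketch (float x)
{
  float ax = fabsf (x);
  float d = 1.0f + sqrtf (fmaf (ax, ax, 1.0f));
  return copysignf (log1pf (ax + (ax * ax) / d), x);
}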
*/ + float32x4_t d +- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x))); +- float32x4_t y = log1pf_inline ( +- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts); ++ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax))); ++ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); + + if (__glibc_unlikely (v_any_u32 (special))) +- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special); +- return vbslq_f32 (SignMask, x, y); ++ return special_case (special_arg, sign, y, special, dat); ++ return vreinterpretq_f32_u32 (veorq_u32 ( ++ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); + } + libmvec_hidden_def (V_NAME_F1 (asinh)) + HALF_WIDTH_ALIAS_F1 (asinh) +diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c +index ae488f7b54..818b6c92ad 100644 +--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c ++++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c +@@ -40,15 +40,17 @@ const static struct data + #define Half v_u32 (0x3f000000) + + static float32x4_t NOINLINE VPCS_ATTR +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, ++ uint32x4_t special) + { +- return v_call_f32 (atanhf, x, y, special); ++ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), ++ vmulq_f32 (halfsign, y), special); + } + + /* Approximation for vector single-precision atanh(x) using modified log1p. +- The maximum error is 3.08 ULP: +- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 +- want 0x1.ffcb82p-5. */ ++ The maximum error is 2.93 ULP: ++ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 ++ want 0x1.f4dcf8p-5. */ + VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) + uint32x4_t special = vcgeq_u32 (iax, d->one); + #endif + +- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax)); +- y = log1pf_inline (y, d->log1pf_consts); ++ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), ++ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); ++ y = log1pf_inline (y, &d->log1pf_consts); + ++ /* If exceptions not required, pass ax to special-case for shorter dependency ++ chain. If exceptions are required ax will have been zerofied, so have to ++ pass x. */ + if (__glibc_unlikely (v_any_u32 (special))) +- return special_case (x, vmulq_f32 (halfsign, y), special); ++#if WANT_SIMD_EXCEPT ++ return special_case (x, halfsign, y, special); ++#else ++ return special_case (ax, halfsign, y, special); ++#endif + return vmulq_f32 (halfsign, y); + } + libmvec_hidden_def (V_NAME_F1 (atanh)) +diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c +index 8cfa28fb8a..00006fc703 100644 +--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c ++++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c +@@ -18,114 +18,79 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" ++#include "v_log1pf_inline.h" ++ ++#if WANT_SIMD_EXCEPT + + const static struct data + { +- float32x4_t poly[8], ln2; +- uint32x4_t tiny_bound, minus_one, four, thresh; +- int32x4_t three_quarters; ++ uint32x4_t minus_one, thresh; ++ struct v_log1pf_data d; + } data = { +- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients +- (1, -0.5) are not stored as they can be generated more +- efficiently. 
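/* A scalar sketch of the identity behind the AdvSIMD atanhf code above:
   atanh(x) = 0.5 * log((1 + x) / (1 - x)) = 0.5 * log1p(2|x| / (1 - |x|)),
   with the sign restored by the +/-0.5 "halfsign" factor.  The name below is
   illustrative and log1pf stands in for log1pf_inline.  */
#include <math.h>

float
atanhf_identity_sketch (float x)            /* |x| < 1.  */
{
  float ax = fabsf (x);
  float halfsign = copysignf (0.5f, x);
  return halfsign * log1pf ((ax + ax) / (1.0f - ax));
}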
*/ +- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), +- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), +- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, +- .ln2 = V4 (0x1.62e43p-1f), +- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ +- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */ ++ .d = V_LOG1PF_CONSTANTS_TABLE, ++ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ + .minus_one = V4 (0xbf800000), +- .four = V4 (0x40800000), +- .three_quarters = V4 (0x3f400000) + }; + +-static inline float32x4_t +-eval_poly (float32x4_t m, const float32x4_t *p) +-{ +- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */ +- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]); +- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]); +- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]); +- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]); +- +- float32x4_t m2 = vmulq_f32 (m, m); +- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12); +- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56); +- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]); +- +- float32x4_t m4 = vmulq_f32 (m2, m2); +- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36); +- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79)); +-} ++/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ ++# define TinyBound v_u32 (0x34000000) + + static float32x4_t NOINLINE VPCS_ATTR +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) + { +- return v_call_f32 (log1pf, x, y, special); ++ /* Side-step special lanes so fenv exceptions are not triggered ++ inadvertently. */ ++ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); ++ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); + } + +-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy +- is roughly 2.02 ULP: +- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ ++/* Vector log1pf approximation using polynomial on reduced interval. Worst-case ++ error is 1.69 ULP: ++ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 ++ want 0x1.cfcbdcp-3. */ + VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); ++ + uint32x4_t special_cases +- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh), ++ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), + vcgeq_u32 (ix, d->minus_one)); +- float32x4_t special_arg = x; + +-#if WANT_SIMD_EXCEPT + if (__glibc_unlikely (v_any_u32 (special_cases))) +- /* Side-step special lanes so fenv exceptions are not triggered +- inadvertently. */ +- x = v_zerofy_f32 (x, special_cases); +-#endif ++ return special_case (x, special_cases, d); + +- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m +- is in [-0.25, 0.5]): +- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). +- +- We approximate log1p(m) with a polynomial, then scale by +- k*log(2). Instead of doing this directly, we use an intermediate +- scale factor s = 4*k*log(2) to ensure the scale is representable +- as a normalised fp32 number. */ ++ return log1pf_inline (x, &d->d); ++} + +- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); ++#else + +- /* Choose k to scale x to the range [-1/4, 1/2]. 
*/ +- int32x4_t k +- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), +- v_s32 (0xff800000)); +- uint32x4_t ku = vreinterpretq_u32_s32 (k); ++const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; + +- /* Scale x by exponent manipulation. */ +- float32x4_t m_scale +- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); ++static float32x4_t NOINLINE VPCS_ATTR ++special_case (float32x4_t x, uint32x4_t cmp) ++{ ++ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp); ++} + +- /* Scale up to ensure that the scale factor is representable as normalised +- fp32 number, and scale m down accordingly. */ +- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); +- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); ++/* Vector log1pf approximation using polynomial on reduced interval. Worst-case ++ error is 1.63 ULP: ++ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 ++ want 0x1.fdcb16p-3. */ ++VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) ++{ ++ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), ++ vcaleq_f32 (x, v_f32 (0x1p127f))); + +- /* Evaluate polynomial on the reduced interval. */ +- float32x4_t p = eval_poly (m_scale, d->poly); ++ if (__glibc_unlikely (v_any_u32 (special_cases))) ++ return special_case (x, special_cases); + +- /* The scale factor to be applied back at the end - by multiplying float(k) +- by 2^-23 we get the unbiased exponent of k. */ +- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23)); ++ return log1pf_inline (x, ptr_barrier (&data)); ++} + +- /* Apply the scaling back. */ +- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2); ++#endif + +- if (__glibc_unlikely (v_any_u32 (special_cases))) +- return special_case (special_arg, y, special_cases); +- return y; +-} + libmvec_hidden_def (V_NAME_F1 (log1p)) + HALF_WIDTH_ALIAS_F1 (log1p) + strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h +index 643a6cdcfc..73e45a942e 100644 +--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h ++++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h +@@ -25,54 +25,81 @@ + + struct v_log1pf_data + { +- float32x4_t poly[8], ln2; + uint32x4_t four; + int32x4_t three_quarters; ++ float c0, c3, c5, c7; ++ float32x4_t c4, c6, c1, c2, ln2; + }; + + /* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more efficiently. */ + #define V_LOG1PF_CONSTANTS_TABLE \ + { \ +- .poly \ +- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \ +- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \ +- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \ +- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ +- .three_quarters = V4 (0x3f400000) \ ++ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \ ++ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \ ++ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \ ++ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \ ++ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ ++ .three_quarters = V4 (0x3f400000) \ + } + + static inline float32x4_t +-eval_poly (float32x4_t m, const float32x4_t *c) ++eval_poly (float32x4_t m, const struct v_log1pf_data *d) + { +- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine +- uses split Estrin, but this way reduces register pressure in the calling +- routine). 
*/ +- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]); ++ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ ++ float32x4_t c0357 = vld1q_f32 (&d->c0); ++ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); + float32x4_t m2 = vmulq_f32 (m, m); +- q = vfmaq_f32 (m, m2, q); +- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1); ++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); ++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); ++ float32x4_t p = vfmaq_f32 (p45, m2, p67); ++ p = vfmaq_f32 (p23, m2, p); ++ p = vfmaq_f32 (d->c1, m, p); + p = vmulq_f32 (m2, p); +- return vfmaq_f32 (q, m2, p); ++ p = vfmaq_f32 (m, m2, p); ++ return vfmaq_f32 (p, m2, q); + } + + static inline float32x4_t +-log1pf_inline (float32x4_t x, const struct v_log1pf_data d) ++log1pf_inline (float32x4_t x, const struct v_log1pf_data *d) + { +- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no +- special-case handling. See that file for details of the algorithm. */ ++ /* Helper for calculating log(x + 1). */ ++ ++ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m ++ is in [-0.25, 0.5]): ++ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). ++ ++ We approximate log1p(m) with a polynomial, then scale by ++ k*log(2). Instead of doing this directly, we use an intermediate ++ scale factor s = 4*k*log(2) to ensure the scale is representable ++ as a normalised fp32 number. */ + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); ++ ++ /* Choose k to scale x to the range [-1/4, 1/2]. */ + int32x4_t k +- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters), ++ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); +- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku)); ++ ++ /* Scale up to ensure that the scale factor is representable as normalised ++ fp32 number, and scale m down accordingly. */ ++ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); ++ ++ /* Scale x by exponent manipulation. */ + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); +- float32x4_t p = eval_poly (m_scale, d.poly); ++ ++ /* Evaluate polynomial on the reduced interval. */ ++ float32x4_t p = eval_poly (m_scale, d); ++ ++ /* The scale factor to be applied back at the end - by multiplying float(k) ++ by 2^-23 we get the unbiased exponent of k. */ + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); +- return vfmaq_f32 (p, scale_back, d.ln2); ++ ++ /* Apply the scaling back. */ ++ return vfmaq_f32 (p, scale_back, d->ln2); + } + + #endif + +commit a947a43b95bbea53ec50df058b42392fd5ea52b6 +Author: Joe Ramsay +Date: Mon Sep 23 15:32:53 2024 +0100 + + AArch64: Improve codegen in users of ADVSIMD expm1f helper + + Rearrange operations so MOV is not necessary in reduction or around + the special-case handler. Reduce memory access by using more indexed + MLAs in polynomial. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 7900ac490db32f6bccff812733f00280dde34e27) + +diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c +index a0616ec754..8303ca296e 100644 +--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c ++++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c +@@ -18,27 +18,18 @@ + . 
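/* A scalar sketch of the identity documented in log1pf_inline above: write
   x + 1 = (1 + m) * 2^k, then log1p(x) = log1p(m) + k * ln(2).  frexpf puts
   1 + m in [0.5, 1) rather than the [0.75, 1.5] range the helper arranges,
   and this sketch skips the exponent-manipulation scaling the helper uses to
   keep intermediates accurate; the function name is illustrative.  */
#include <math.h>

float
log1pf_identity_sketch (float x)            /* x > -1.  */
{
  int k;
  float t = frexpf (x + 1.0f, &k);          /* x + 1 == t * 2^k.  */
  return log1pf (t - 1.0f) + (float) k * 0x1.62e43p-1f;   /* ln(2).  */
}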
*/ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" ++#include "v_expm1f_inline.h" + + static const struct data + { +- float32x4_t poly[5]; +- float invln2_and_ln2[4]; +- float32x4_t shift; +- int32x4_t exponent_bias; ++ struct v_expm1f_data d; + #if WANT_SIMD_EXCEPT + uint32x4_t thresh; + #else + float32x4_t oflow_bound; + #endif + } data = { +- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */ +- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), +- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, +- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */ +- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, +- .shift = V4 (0x1.8p23f), +- .exponent_bias = V4 (0x3f800000), ++ .d = V_EXPM1F_DATA, + #if !WANT_SIMD_EXCEPT + /* Value above which expm1f(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is +@@ -55,67 +46,38 @@ static const struct data + #define TinyBound v_u32 (0x34000000 << 1) + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, uint32x4_t special, const struct data *d) + { +- return v_call_f32 (expm1f, x, y, special); ++ return v_call_f32 ( ++ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); + } + + /* Single-precision vector exp(x) - 1 function. +- The maximum error is 1.51 ULP: +- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2 +- want 0x1.e2fb94p-2. */ ++ The maximum error is 1.62 ULP: ++ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2 ++ want 0x1.da9f44p-2. */ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- uint32x4_t ix = vreinterpretq_u32_f32 (x); + + #if WANT_SIMD_EXCEPT ++ uint32x4_t ix = vreinterpretq_u32_f32 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); +- if (__glibc_unlikely (v_any_u32 (special))) +- x = v_zerofy_f32 (x, special); + #else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); + #endif + +- /* Reduce argument to smaller range: +- Let i = round(x / ln2) +- and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. +- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 +- where 2^i is exact because i is an integer. */ +- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); +- float32x4_t j +- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); +- int32x4_t i = vcvtq_s32_f32 (j); +- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); +- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); +- +- /* Approximate expm1(f) using polynomial. +- Taylor expansion for expm1(x) has the form: +- x + ax^2 + bx^3 + cx^4 .... +- So we calculate the polynomial P(f) = a + bf + cf^2 + ... +- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ +- float32x4_t p = v_horner_4_f32 (f, d->poly); +- p = vfmaq_f32 (f, vmulq_f32 (f, f), p); +- +- /* Assemble the result. +- expm1(x) ~= 2^i * (p + 1) - 1 +- Let t = 2^i. 
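/* A scalar sketch of the expm1f reduction described in the comments above:
   i = round(x / ln2), f = x - i*ln2 (split into hi/lo parts), then
   expm1(x) = 2^i * (expm1(f) + 1) - 1 = p*t + (t - 1) with t = 2^i.  The
   function name is illustrative and libm's expm1f stands in for the
   polynomial f + f^2 * P(f).  */
#include <math.h>

float
expm1f_reduction_sketch (float x)           /* |x| below the overflow bound.  */
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;

  float j = roundf (x * inv_ln2);           /* round to nearest, like vrndaq.  */
  int i = (int) j;
  float f = fmaf (-j, ln2_hi, x);
  f = fmaf (-j, ln2_lo, f);

  float p = expm1f (f);                     /* stand-in for the polynomial.  */
  float t = ldexpf (1.0f, i);               /* the vector code instead adds
                                               i << 23 to the exponent bias.  */
  return fmaf (p, t, t - 1.0f);
}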
*/ +- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); +- float32x4_t t = vreinterpretq_f32_s32 (u); +- + if (__glibc_unlikely (v_any_u32 (special))) +- return special_case (vreinterpretq_f32_u32 (ix), +- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t), +- special); ++ return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). */ +- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); ++ return expm1f_inline (x, &d->d); + } + libmvec_hidden_def (V_NAME_F1 (expm1)) + HALF_WIDTH_ALIAS_F1 (expm1) +diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c +index 6bb7482dc2..c6ed7598e7 100644 +--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c ++++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c +@@ -23,15 +23,13 @@ + static const struct data + { + struct v_expm1f_data expm1f_consts; +- uint32x4_t halff; + #if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound, thresh; + #else +- uint32x4_t oflow_bound; ++ float32x4_t oflow_bound; + #endif + } data = { + .expm1f_consts = V_EXPM1F_DATA, +- .halff = V4 (0x3f000000), + #if WANT_SIMD_EXCEPT + /* 0x1.6a09e8p-32, below which expm1f underflows. */ + .tiny_bound = V4 (0x2fb504f4), +@@ -39,14 +37,15 @@ static const struct data + .thresh = V4 (0x12fbbbb3), + #else + /* 0x1.61814ep+6, above which expm1f helper overflows. */ +- .oflow_bound = V4 (0x42b0c0a7), ++ .oflow_bound = V4 (0x1.61814ep+6), + #endif + }; + + static float32x4_t NOINLINE VPCS_ATTR +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign, ++ uint32x4_t special) + { +- return v_call_f32 (sinhf, x, y, special); ++ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special); + } + + /* Approximation for vector single-precision sinh(x) using expm1. +@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + float32x4_t ax = vabsq_f32 (x); +- uint32x4_t iax = vreinterpretq_u32_f32 (ax); +- uint32x4_t sign = veorq_u32 (ix, iax); +- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff)); ++ float32x4_t halfsign = vreinterpretq_f32_u32 ( ++ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5)))); + + #if WANT_SIMD_EXCEPT +- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh); ++ uint32x4_t special = vcgeq_u32 ( ++ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh); + ax = v_zerofy_f32 (ax, special); + #else +- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound); ++ uint32x4_t special = vcageq_f32 (x, d->oflow_bound); + #endif + + /* Up to the point that expm1f overflows, we can use it to calculate sinhf +@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) + /* Fall back to the scalar variant for any lanes that should trigger an + exception. */ + if (__glibc_unlikely (v_any_u32 (special))) +- return special_case (x, vmulq_f32 (t, halfsign), special); ++ return special_case (x, t, halfsign, special); + + return vmulq_f32 (t, halfsign); + } +diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c +index 50defd6ef0..3ced9b7a41 100644 +--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c ++++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c +@@ -28,13 +28,16 @@ static const struct data + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). 
*/ + .boring_bound = V4 (0x41102cb3), + .large_bound = V4 (0x7f800000), +- .onef = V4 (0x3f800000), + }; + + static float32x4_t NOINLINE VPCS_ATTR +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring, ++ float32x4_t q, uint32x4_t special) + { +- return v_call_f32 (tanhf, x, y, special); ++ return v_call_f32 ( ++ tanhf, x, ++ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))), ++ special); + } + + /* Approximation for single-precision vector tanh(x), using a simplified +@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t sign = veorq_u32 (ix, iax); + uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); +- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef)); ++ /* expm1 exponent bias is 1.0f reinterpreted to int. */ ++ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 ( ++ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias))); + + #if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered properly, set all special and boring +@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); +- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); ++ + if (__glibc_unlikely (v_any_u32 (special))) +- return special_case (vreinterpretq_f32_u32 (ix), +- vbslq_f32 (is_boring, boring, y), special); ++ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q, ++ special); ++ ++ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); + return vbslq_f32 (is_boring, boring, y); + } + libmvec_hidden_def (V_NAME_F1 (tanh)) +diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h +index 59b552da6b..1daedfdd51 100644 +--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h ++++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h +@@ -21,48 +21,47 @@ + #define AARCH64_FPU_V_EXPM1F_INLINE_H + + #include "v_math.h" +-#include "poly_advsimd_f32.h" ++#include "math_config.h" + + struct v_expm1f_data + { +- float32x4_t poly[5]; +- float invln2_and_ln2[4]; +- float32x4_t shift; ++ float32x4_t c0, c2; + int32x4_t exponent_bias; ++ float c1, c3, inv_ln2, c4; ++ float ln2_hi, ln2_lo; + }; + + /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, +- log(2)/2]. Exponent bias is asuint(1.0f). +- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */ ++ log(2)/2]. Exponent bias is asuint(1.0f). */ + #define V_EXPM1F_DATA \ + { \ +- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \ +- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \ +- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \ +- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ ++ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \ ++ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ ++ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \ ++ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + } + + static inline float32x4_t + expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) + { +- /* Helper routine for calculating exp(x) - 1. +- Copied from v_expm1f_1u6.c, with all special-case handling removed - the +- calling routine should handle special values if required. */ ++ /* Helper routine for calculating exp(x) - 1. 
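/* A scalar sketch of the tanhf evaluation above: with q = expm1(2x),
   tanh(x) = (e^2x - 1) / (e^2x + 1) = q / (q + 2), and |x| above
   0x1.205966p+3 (the "boring" lanes) saturates to +/-1.  The function name
   is illustrative and libm's expm1f stands in for expm1f_inline.  */
#include <math.h>

float
tanhf_identity_sketch (float x)
{
  if (fabsf (x) > 0x1.205966p+3f)
    return copysignf (1.0f, x);
  float q = expm1f (2.0f * x);
  return q / (q + 2.0f);
}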
*/ ++ ++ float32x2_t ln2 = vld1_f32 (&d->ln2_hi); ++ float32x4_t lane_consts = vld1q_f32 (&d->c1); + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ +- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); +- float32x4_t j +- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); ++ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); + int32x4_t i = vcvtq_s32_f32 (j); +- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); +- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); ++ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); ++ f = vfmsq_lane_f32 (f, j, ln2, 1); + +- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). +- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses +- Horner. */ ++ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */ + float32x4_t f2 = vmulq_f32 (f, f); + float32x4_t f4 = vmulq_f32 (f2, f2); +- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly); ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); ++ float32x4_t p = vfmaq_f32 (p01, f2, p23); ++ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3); + p = vfmaq_f32 (f, f2, p); + + /* t = 2^i. */ + +commit 68f2eb20de698675ddc74068c2cd03fee29207df +Author: Joe Ramsay +Date: Mon Sep 23 15:33:31 2024 +0100 + + AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines + + This operation can be simplified to use simpler multiply-round-convert + sequence, which uses fewer instructions and constants. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 16a59571e4e9fd019d3fc23a2e7d73c1df8bb5cb) + +diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c +index 3924c9ce44..11a89b1530 100644 +--- a/sysdeps/aarch64/fpu/cos_advsimd.c ++++ b/sysdeps/aarch64/fpu/cos_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float64x2_t poly[7]; +- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; ++ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), +@@ -30,11 +30,9 @@ static const struct data + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), +- .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), +- .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) + }; + +@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) + #endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ +- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); +- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); +- n = vsubq_f64 (n, d->shift); +- n = vsubq_f64 (n, v_f64 (0.5)); ++ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); ++ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); ++ n = vsubq_f64 (n, v_f64 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
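/* Scalar illustrations of the change described in the commit message above:
   rounding to the nearest integer by adding and subtracting a large "shift"
   constant (0x1.8p52 for doubles, 0x1.8p23f for floats) versus a plain
   round-and-convert, as the new vrndaq/vcvtq sequence does.  The function
   names are illustrative.  */
#include <math.h>

double
nearest_int_via_shift (double z)            /* only valid for |z| << 2^51.  */
{
  const double shift = 0x1.8p52;
  return (z + shift) - shift;               /* the addition itself rounds,
                                               under the default rounding mode.  */
}

double
nearest_int_via_round (double z)
{
  return round (z);                         /* ties away from zero, like frinta.  */
}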
*/ + r = vfmsq_f64 (r, d->pi_1, n); +diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c +index d0c285b03a..85a1b37373 100644 +--- a/sysdeps/aarch64/fpu/cosf_advsimd.c ++++ b/sysdeps/aarch64/fpu/cosf_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float32x4_t poly[4]; +- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; ++ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), +@@ -33,8 +33,6 @@ static const struct data + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), +- .shift = V4 (0x1.8p+23f), +- .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) + }; + +@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) + #endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ +- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); +- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); +- n = vsubq_f32 (n, d->shift); ++ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); ++ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ +diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c +index 99d2e647aa..5c9cb72620 100644 +--- a/sysdeps/aarch64/fpu/expf_advsimd.c ++++ b/sysdeps/aarch64/fpu/expf_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float32x4_t poly[5]; +- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; ++ float32x4_t inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; + #if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +@@ -31,7 +31,6 @@ static const struct data + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, +- .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), +@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- float32x4_t n, r, r2, scale, p, q, poly, z; ++ float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + + #if WANT_SIMD_EXCEPT +@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ +- z = vfmaq_f32 (d->shift, x, d->inv_ln2); +- n = vsubq_f32 (z, d->shift); ++ n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); +- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); ++ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + #if !WANT_SIMD_EXCEPT +diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c +index a0d9d3b819..718125cbad 100644 +--- a/sysdeps/aarch64/fpu/sin_advsimd.c ++++ b/sysdeps/aarch64/fpu/sin_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float64x2_t poly[7]; +- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; ++ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), +@@ -34,12 +34,13 @@ static const struct data + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), +- .shift = V2 (0x1.8p52), + }; + + #if WANT_SIMD_EXCEPT +-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ ++/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */ ++# define TinyBound v_u64 (0x3020000000000000) ++/* RangeVal - TinyBound. */ ++# define Thresh v_u64 (0x1160000000000000) + #endif + + #define C(i) d->poly[i] +@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); +- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); ++ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); + #else + r = x; + cmp = vcageq_f64 (x, d->range_val); + #endif + + /* n = rint(|x|/pi). */ +- n = vfmaq_f64 (d->shift, d->inv_pi, r); +- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); +- n = vsubq_f64 (n, d->shift); ++ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); ++ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); +diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c +index 375dfc3331..6ee9a23d5b 100644 +--- a/sysdeps/aarch64/fpu/sinf_advsimd.c ++++ b/sysdeps/aarch64/fpu/sinf_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float32x4_t poly[4]; +- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; ++ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), +@@ -33,13 +33,14 @@ static const struct data + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), +- .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) + }; + + #if WANT_SIMD_EXCEPT +-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ ++/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ ++# define TinyBound v_u32 (0x22000000) ++/* RangeVal - TinyBound. 
*/ ++# define Thresh v_u32 (0x27800000) + #endif + + #define C(i) d->poly[i] +@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ +- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); ++ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); + #else + r = x; + cmp = vcageq_f32 (x, d->range_val); + #endif + +- /* n = rint(|x|/pi) */ +- n = vfmaq_f32 (d->shift, d->inv_pi, r); +- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); +- n = vsubq_f32 (n, d->shift); ++ /* n = rint(|x|/pi). */ ++ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); ++ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); + +- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ ++ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + +- /* y = sin(r) */ ++ /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + +commit 9ff7559b274eb0dbce2cbcf87284c1d30d47a2d6 +Author: Joe Ramsay +Date: Mon Oct 28 14:58:35 2024 +0000 + + AArch64: Small optimisation in AdvSIMD erf and erfc + + In both routines, reduce register pressure such that GCC 14 emits no + spills for erf and fewer spills for erfc. Also use more efficient + comparison for the special-case in erf. + + Benchtests show erf improves by 6.4%, erfc by 1.0%. + + (cherry picked from commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252) + +diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c +index 19cbb7d0f4..c0116735e4 100644 +--- a/sysdeps/aarch64/fpu/erf_advsimd.c ++++ b/sysdeps/aarch64/fpu/erf_advsimd.c +@@ -22,19 +22,21 @@ + static const struct data + { + float64x2_t third; +- float64x2_t tenth, two_over_five, two_over_fifteen; +- float64x2_t two_over_nine, two_over_fortyfive; ++ float64x2_t tenth, two_over_five, two_over_nine; ++ double two_over_fifteen, two_over_fortyfive; + float64x2_t max, shift; ++ uint64x2_t max_idx; + #if WANT_SIMD_EXCEPT + float64x2_t tiny_bound, huge_bound, scale_minus_one; + #endif + } data = { ++ .max_idx = V2 (768), + .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ +- .two_over_fifteen = V2 (0x1.1111111111111p-3), ++ .two_over_fifteen = 0x1.1111111111111p-3, + .tenth = V2 (-0x1.999999999999ap-4), + .two_over_five = V2 (-0x1.999999999999ap-2), + .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), +- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5), ++ .two_over_fortyfive = 0x1.6c16c16c16c17p-5, + .max = V2 (5.9921875), /* 6 - 1/128. */ + .shift = V2 (0x1p45), + #if WANT_SIMD_EXCEPT +@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) + float64x2_t a = vabsq_f64 (x); + /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs + to return expected results. */ +- uint64x2_t a_le_max = vcleq_f64 (a, dat->max); +- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max); ++ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); ++ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); + + #if WANT_SIMD_EXCEPT + /* |x| huge or tiny. */ +@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) + segfault. 
*/ + uint64x2_t i + = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); +- i = vbslq_u64 (a_le_max, i, v_u64 (768)); ++ i = vbslq_u64 (a_le_max, i, dat->max_idx); + struct entry e = lookup (i); + + float64x2_t r = vsubq_f64 (z, shift); +@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + ++ float64x2_t two_over_fifteen_and_fortyfive ++ = vld1q_f64 (&dat->two_over_fifteen); ++ + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + float64x2_t p1 = r; + float64x2_t p2 + = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); +- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen); ++ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2, ++ two_over_fifteen_and_fortyfive, 0); + p4 = vfmsq_f64 (dat->tenth, r2, p4); +- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive); ++ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2, ++ two_over_fifteen_and_fortyfive, 1); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); + + float64x2_t p34 = vfmaq_f64 (p3, d, p4); +diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c +index f1b3bfe830..2f2f755c46 100644 +--- a/sysdeps/aarch64/fpu/erfc_advsimd.c ++++ b/sysdeps/aarch64/fpu/erfc_advsimd.c +@@ -24,8 +24,8 @@ static const struct data + { + uint64x2_t offset, table_scale; + float64x2_t max, shift; +- float64x2_t p20, p40, p41, p42; +- float64x2_t p51, p52; ++ float64x2_t p20, p40, p41, p51; ++ double p42, p52; + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; + #if WANT_SIMD_EXCEPT + float64x2_t uflow_bound; +@@ -41,9 +41,9 @@ static const struct data + .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ + .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ + .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ +- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */ ++ .p42 = 0x1.1111111111111p-3, /* 2/15. */ + .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ +- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */ ++ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ + /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, + .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, +@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) + float64x2_t p1 = r; + float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); +- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42); ++ float64x2_t p42_p52 = vld1q_f64 (&dat->p42); ++ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0); + p4 = vfmsq_f64 (dat->p40, r2, p4); +- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52); ++ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + +commit 76c923fe9d09befc8131205659d99cb9ac97460a +Author: Joe Ramsay +Date: Fri Nov 1 15:48:54 2024 +0000 + + AArch64: Remove SVE erf and erfc tables + + By using a combination of mask-and-add instead of the shift-based + index calculation the routines can share the same table as other + variants with no performance degradation. 
+ + The tables change name because of other changes in downstream AOR. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 2d82d781a539ce8e82178fc1fa2c99ae1884e7fe) + +diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile +index 234a6c457c..be8541f649 100644 +--- a/sysdeps/aarch64/fpu/Makefile ++++ b/sysdeps/aarch64/fpu/Makefile +@@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \ + v_log10_data \ + erf_data \ + erff_data \ +- sv_erf_data \ +- sv_erff_data \ + v_exp_tail_data \ + erfc_data \ + erfcf_data \ +diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c +index c0116735e4..a48092e838 100644 +--- a/sysdeps/aarch64/fpu/erf_advsimd.c ++++ b/sysdeps/aarch64/fpu/erf_advsimd.c +@@ -58,8 +58,8 @@ static inline struct entry + lookup (uint64x2_t i) + { + struct entry e; +- float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf), +- e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf); ++ float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf), ++ e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf); + e.erf = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +diff --git a/sysdeps/aarch64/fpu/erf_data.c b/sysdeps/aarch64/fpu/erf_data.c +index 6d2dcd235c..ea01fad7ca 100644 +--- a/sysdeps/aarch64/fpu/erf_data.c ++++ b/sysdeps/aarch64/fpu/erf_data.c +@@ -19,14 +19,14 @@ + + #include "vecmath_config.h" + +-/* Lookup table used in erf. ++/* Lookup table used in vector erf. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 6.0 (769 values): +- - the first entry __erff_data.tab.erf contains the values of erf(r), +- - the second entry __erff_data.tab.scale contains the values of ++ - the first entry __v_erff_data.tab.erf contains the values of erf(r), ++ - the second entry __v_erff_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. */ +-const struct erf_data __erf_data = { ++const struct v_erf_data __v_erf_data = { + .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, + { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, + { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, +diff --git a/sysdeps/aarch64/fpu/erf_sve.c b/sysdeps/aarch64/fpu/erf_sve.c +index 7d51417406..671d55a02b 100644 +--- a/sysdeps/aarch64/fpu/erf_sve.c ++++ b/sysdeps/aarch64/fpu/erf_sve.c +@@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg) + svfloat64_t a = svabs_x (pg, x); + svfloat64_t shift = sv_f64 (dat->shift); + svfloat64_t z = svadd_x (pg, a, shift); +- svuint64_t i +- = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); ++ svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff); ++ i = svadd_x (pg, i, i); + + /* Lookup without shortcut for small values but with predicate to avoid + segfault for large values and NaNs. */ + svfloat64_t r = svsub_x (pg, z, shift); +- svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); +- svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); ++ svfloat64_t erfr ++ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i); ++ svfloat64_t scale ++ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). 
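/* A scalar sketch of the shared-table index computation used above.  Adding
   a large shift (0x1p45 in the AdvSIMD erf data shown earlier; the SVE
   constant is not visible in this hunk) rounds |x| to a multiple of 1/128
   and leaves round(|x| * 128) in the low mantissa bits, so masking with
   0xfff and doubling gives the offset of the interleaved
   { erf(r), 2/sqrt(pi)*exp(-r^2) } pair in __v_erf_data.tab.  The function
   name is illustrative.  */
#include <stdint.h>
#include <string.h>

uint64_t
erf_table_index_sketch (double a)           /* a = |x|, a <= 6.  */
{
  const double shift = 0x1p45;              /* ulp (shift) == 1/128.  */
  double z = a + shift;
  uint64_t iz;
  memcpy (&iz, &z, sizeof iz);
  uint64_t i = iz & 0xfff;                  /* round (a * 128), at most 768.  */
  return i + i;                             /* first double of the pair.  */
}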
*/ + svfloat64_t d = svsub_x (pg, a, r); +diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c +index 2f2f755c46..d05eac61a2 100644 +--- a/sysdeps/aarch64/fpu/erfc_advsimd.c ++++ b/sysdeps/aarch64/fpu/erfc_advsimd.c +@@ -69,9 +69,9 @@ lookup (uint64x2_t i) + { + struct entry e; + float64x2_t e1 +- = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); ++ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + float64x2_t e2 +- = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); ++ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); + e.erfc = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +diff --git a/sysdeps/aarch64/fpu/erfc_data.c b/sysdeps/aarch64/fpu/erfc_data.c +index 76a94e4681..8dc6a8c42c 100644 +--- a/sysdeps/aarch64/fpu/erfc_data.c ++++ b/sysdeps/aarch64/fpu/erfc_data.c +@@ -19,14 +19,14 @@ + + #include "vecmath_config.h" + +-/* Lookup table used in erfc. ++/* Lookup table used in vector erfc. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = ~27.0 (3488 values): +- - the first entry __erfc_data.tab.erfc contains the values of erfc(r), +- - the second entry __erfc_data.tab.scale contains the values of ++ - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r), ++ - the second entry __v_erfc_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore + they are scaled by a large enough value 2^128 (fits in 8bit). */ +-const struct erfc_data __erfc_data = { ++const struct v_erfc_data __v_erfc_data = { + .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, + { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, + { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, +diff --git a/sysdeps/aarch64/fpu/erfc_sve.c b/sysdeps/aarch64/fpu/erfc_sve.c +index c17d3e4484..703926ee41 100644 +--- a/sysdeps/aarch64/fpu/erfc_sve.c ++++ b/sysdeps/aarch64/fpu/erfc_sve.c +@@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. 
*/ + i = svadd_x (pg, i, i); +- const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr; ++ const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr; + svfloat64_t erfcr = svld1_gather_index (pg, p, i); + svfloat64_t scale = svld1_gather_index (pg, p + 1, i); + +diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c +index ca5bc3ab33..59b0b0d64b 100644 +--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c ++++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c +@@ -62,13 +62,13 @@ lookup (uint32x4_t i) + { + struct entry e; + float32x2_t t0 +- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); ++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + float32x2_t t1 +- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); ++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + float32x2_t t2 +- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); ++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + float32x2_t t3 +- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); ++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); + e.erfc = vuzp1q_f32 (e1, e2); +diff --git a/sysdeps/aarch64/fpu/erfcf_data.c b/sysdeps/aarch64/fpu/erfcf_data.c +index 77fb889a78..d45087bbb9 100644 +--- a/sysdeps/aarch64/fpu/erfcf_data.c ++++ b/sysdeps/aarch64/fpu/erfcf_data.c +@@ -19,14 +19,14 @@ + + #include "vecmath_config.h" + +-/* Lookup table used in erfcf. ++/* Lookup table used in vector erfcf. + For each possible rounded input r (multiples of 1/64), between + r = 0.0 and r = 10.0625 (645 values): +- - the first entry __erfcf_data.tab.erfc contains the values of erfc(r), +- - the second entry __erfcf_data.tab.scale contains the values of ++ - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r), ++ - the second entry __v_erfcf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore + they are scaled by a large enough value 2^47 (fits in 8 bits). */ +-const struct erfcf_data __erfcf_data = { ++const struct v_erfcf_data __v_erfcf_data = { + .tab = { { 0x1p47, 0x1.20dd76p47 }, + { 0x1.f6f944p46, 0x1.20cb68p47 }, + { 0x1.edf3aap46, 0x1.209546p47 }, +diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c +index 48d1677eb4..ecacb933ac 100644 +--- a/sysdeps/aarch64/fpu/erfcf_sve.c ++++ b/sysdeps/aarch64/fpu/erfcf_sve.c +@@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. 
*/ + i = svmul_x (pg, i, 2); +- const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr; ++ const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; + svfloat32_t erfcr = svld1_gather_index (pg, p, i); + svfloat32_t scale = svld1_gather_index (pg, p + 1, i); + +diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c +index f2fe6ff236..db39e789b6 100644 +--- a/sysdeps/aarch64/fpu/erff_advsimd.c ++++ b/sysdeps/aarch64/fpu/erff_advsimd.c +@@ -47,10 +47,10 @@ static inline struct entry + lookup (uint32x4_t i) + { + struct entry e; +- float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf); +- float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf); +- float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf); +- float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf); ++ float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf); ++ float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf); ++ float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf); ++ float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); + e.erf = vuzp1q_f32 (e1, e2); +diff --git a/sysdeps/aarch64/fpu/erff_data.c b/sysdeps/aarch64/fpu/erff_data.c +index 9a32940915..da38aed205 100644 +--- a/sysdeps/aarch64/fpu/erff_data.c ++++ b/sysdeps/aarch64/fpu/erff_data.c +@@ -19,14 +19,14 @@ + + #include "vecmath_config.h" + +-/* Lookup table used in erff. ++/* Lookup table used in vector erff. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 4.0 (513 values): +- - the first entry __erff_data.tab.erf contains the values of erf(r), +- - the second entry __erff_data.tab.scale contains the values of ++ - the first entry __v_erff_data.tab.erf contains the values of erf(r), ++ - the second entry __v_erff_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. */ +-const struct erff_data __erff_data = { ++const struct v_erff_data __v_erff_data = { + .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, + { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, + { 0x1.20d770p-6, 0x1.20cb68p+0 }, +diff --git a/sysdeps/aarch64/fpu/erff_sve.c b/sysdeps/aarch64/fpu/erff_sve.c +index 38f00db9be..0e382eb09a 100644 +--- a/sysdeps/aarch64/fpu/erff_sve.c ++++ b/sysdeps/aarch64/fpu/erff_sve.c +@@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) + + svfloat32_t shift = sv_f32 (dat->shift); + svfloat32_t z = svadd_x (pg, a, shift); +- svuint32_t i +- = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift)); +- +- /* Saturate lookup index. */ +- i = svsel (a_ge_max, sv_u32 (512), i); ++ svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff); ++ i = svadd_x (pg, i, i); + + /* r and erf(r) set to 0 for |x| below min. */ + svfloat32_t r = svsub_z (a_gt_min, z, shift); +- svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i); ++ svfloat32_t erfr ++ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i); + + /* scale set to 2/sqrt(pi) for |x| below min. 
*/ +- svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i); ++ svfloat32_t scale ++ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i); + scale = svsel (a_gt_min, scale, sv_f32 (dat->scale)); + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */ +diff --git a/sysdeps/aarch64/fpu/sv_erf_data.c b/sysdeps/aarch64/fpu/sv_erf_data.c +deleted file mode 100644 +index a53878f893..0000000000 +--- a/sysdeps/aarch64/fpu/sv_erf_data.c ++++ /dev/null +@@ -1,1570 +0,0 @@ +-/* Table for SVE erf approximation +- +- Copyright (C) 2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include "vecmath_config.h" +- +-/* Lookup table used in vector erf. +- For each possible rounded input r (multiples of 1/128), between +- r = 0.0 and r = 6.0 (769 values): +- - the first entry __erf_data.tab.erf contains the values of erf(r), +- - the second entry __erf_data.tab.scale contains the values of +- 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the +- algorithm, since lookup is performed only for x >= 1/64-1/512. */ +-const struct sv_erf_data __sv_erf_data = { +- .erf = { 0x0.0000000000000p+0, +- 0x1.20dbf3deb1340p-7, +- 0x1.20d77083f17a0p-6, +- 0x1.b137e0cf584dcp-6, +- 0x1.20c5645dd2538p-5, +- 0x1.68e5d3bbc9526p-5, +- 0x1.b0fafef135745p-5, +- 0x1.f902a77bd3821p-5, +- 0x1.207d480e90658p-4, +- 0x1.44703e87e8593p-4, +- 0x1.68591a1e83b5dp-4, +- 0x1.8c36beb8a8d23p-4, +- 0x1.b0081148a873ap-4, +- 0x1.d3cbf7e70a4b3p-4, +- 0x1.f78159ec8bb50p-4, +- 0x1.0d939005f65e5p-3, +- 0x1.1f5e1a35c3b89p-3, +- 0x1.311fc15f56d14p-3, +- 0x1.42d7fc2f64959p-3, +- 0x1.548642321d7c6p-3, +- 0x1.662a0bdf7a89fp-3, +- 0x1.77c2d2a765f9ep-3, +- 0x1.895010fdbdbfdp-3, +- 0x1.9ad142662e14dp-3, +- 0x1.ac45e37fe2526p-3, +- 0x1.bdad72110a648p-3, +- 0x1.cf076d1233237p-3, +- 0x1.e05354b96ff36p-3, +- 0x1.f190aa85540e2p-3, +- 0x1.015f78a3dcf3dp-2, +- 0x1.09eed6982b948p-2, +- 0x1.127631eb8de32p-2, +- 0x1.1af54e232d609p-2, +- 0x1.236bef825d9a2p-2, +- 0x1.2bd9db0f7827fp-2, +- 0x1.343ed6989b7d9p-2, +- 0x1.3c9aa8b84bedap-2, +- 0x1.44ed18d9f6462p-2, +- 0x1.4d35ef3e5372ep-2, +- 0x1.5574f4ffac98ep-2, +- 0x1.5da9f415ff23fp-2, +- 0x1.65d4b75b00471p-2, +- 0x1.6df50a8dff772p-2, +- 0x1.760aba57a76bfp-2, +- 0x1.7e15944d9d3e4p-2, +- 0x1.861566f5fd3c0p-2, +- 0x1.8e0a01cab516bp-2, +- 0x1.95f3353cbb146p-2, +- 0x1.9dd0d2b721f39p-2, +- 0x1.a5a2aca209394p-2, +- 0x1.ad68966569a87p-2, +- 0x1.b522646bbda68p-2, +- 0x1.bccfec24855b8p-2, +- 0x1.c4710406a65fcp-2, +- 0x1.cc058392a6d2dp-2, +- 0x1.d38d4354c3bd0p-2, +- 0x1.db081ce6e2a48p-2, +- 0x1.e275eaf25e458p-2, +- 0x1.e9d68931ae650p-2, +- 0x1.f129d471eabb1p-2, +- 0x1.f86faa9428f9dp-2, +- 0x1.ffa7ea8eb5fd0p-2, +- 0x1.03693a371519cp-1, +- 0x1.06f794ab2cae7p-1, +- 0x1.0a7ef5c18edd2p-1, +- 0x1.0dff4f247f6c6p-1, +- 0x1.1178930ada115p-1, +- 0x1.14eab43841b55p-1, +- 0x1.1855a5fd3dd50p-1, +- 
0x1.1bb95c3746199p-1, +- 0x1.1f15cb50bc4dep-1, +- 0x1.226ae840d4d70p-1, +- 0x1.25b8a88b6dd7fp-1, +- 0x1.28ff0240d52cdp-1, +- 0x1.2c3debfd7d6c1p-1, +- 0x1.2f755ce9a21f4p-1, +- 0x1.32a54cb8db67bp-1, +- 0x1.35cdb3a9a144dp-1, +- 0x1.38ee8a84beb71p-1, +- 0x1.3c07ca9cb4f9ep-1, +- 0x1.3f196dcd0f135p-1, +- 0x1.42236e79a5fa6p-1, +- 0x1.4525c78dd5966p-1, +- 0x1.4820747ba2dc2p-1, +- 0x1.4b13713ad3513p-1, +- 0x1.4dfeba47f63ccp-1, +- 0x1.50e24ca35fd2cp-1, +- 0x1.53be25d016a4fp-1, +- 0x1.569243d2b3a9bp-1, +- 0x1.595ea53035283p-1, +- 0x1.5c2348ecc4dc3p-1, +- 0x1.5ee02e8a71a53p-1, +- 0x1.61955607dd15dp-1, +- 0x1.6442bfdedd397p-1, +- 0x1.66e86d0312e82p-1, +- 0x1.69865ee075011p-1, +- 0x1.6c1c9759d0e5fp-1, +- 0x1.6eab18c74091bp-1, +- 0x1.7131e5f496a5ap-1, +- 0x1.73b1021fc0cb8p-1, +- 0x1.762870f720c6fp-1, +- 0x1.78983697dc96fp-1, +- 0x1.7b00578c26037p-1, +- 0x1.7d60d8c979f7bp-1, +- 0x1.7fb9bfaed8078p-1, +- 0x1.820b1202f27fbp-1, +- 0x1.8454d5f25760dp-1, +- 0x1.8697120d92a4ap-1, +- 0x1.88d1cd474a2e0p-1, +- 0x1.8b050ef253c37p-1, +- 0x1.8d30debfc572ep-1, +- 0x1.8f5544bd00c04p-1, +- 0x1.91724951b8fc6p-1, +- 0x1.9387f53df5238p-1, +- 0x1.959651980da31p-1, +- 0x1.979d67caa6631p-1, +- 0x1.999d4192a5715p-1, +- 0x1.9b95e8fd26abap-1, +- 0x1.9d8768656cc42p-1, +- 0x1.9f71ca72cffb6p-1, +- 0x1.a1551a16aaeafp-1, +- 0x1.a331628a45b92p-1, +- 0x1.a506af4cc00f4p-1, +- 0x1.a6d50c20fa293p-1, +- 0x1.a89c850b7d54dp-1, +- 0x1.aa5d265064366p-1, +- 0x1.ac16fc7143263p-1, +- 0x1.adca142b10f98p-1, +- 0x1.af767a741088bp-1, +- 0x1.b11c3c79bb424p-1, +- 0x1.b2bb679ead19cp-1, +- 0x1.b4540978921eep-1, +- 0x1.b5e62fce16095p-1, +- 0x1.b771e894d602ep-1, +- 0x1.b8f741ef54f83p-1, +- 0x1.ba764a2af2b78p-1, +- 0x1.bbef0fbde6221p-1, +- 0x1.bd61a1453ab44p-1, +- 0x1.bece0d82d1a5cp-1, +- 0x1.c034635b66e23p-1, +- 0x1.c194b1d49a184p-1, +- 0x1.c2ef0812fc1bdp-1, +- 0x1.c443755820d64p-1, +- 0x1.c5920900b5fd1p-1, +- 0x1.c6dad2829ec62p-1, +- 0x1.c81de16b14cefp-1, +- 0x1.c95b455cce69dp-1, +- 0x1.ca930e0e2a825p-1, +- 0x1.cbc54b476248dp-1, +- 0x1.ccf20ce0c0d27p-1, +- 0x1.ce1962c0e0d8bp-1, +- 0x1.cf3b5cdaf0c39p-1, +- 0x1.d0580b2cfd249p-1, +- 0x1.d16f7dbe41ca0p-1, +- 0x1.d281c49d818d0p-1, +- 0x1.d38eefdf64fddp-1, +- 0x1.d4970f9ce00d9p-1, +- 0x1.d59a33f19ed42p-1, +- 0x1.d6986cfa798e7p-1, +- 0x1.d791cad3eff01p-1, +- 0x1.d8865d98abe01p-1, +- 0x1.d97635600bb89p-1, +- 0x1.da61623cb41e0p-1, +- 0x1.db47f43b2980dp-1, +- 0x1.dc29fb60715afp-1, +- 0x1.dd0787a8bb39dp-1, +- 0x1.dde0a90611a0dp-1, +- 0x1.deb56f5f12d28p-1, +- 0x1.df85ea8db188ep-1, +- 0x1.e0522a5dfda73p-1, +- 0x1.e11a3e8cf4eb8p-1, +- 0x1.e1de36c75ba58p-1, +- 0x1.e29e22a89d766p-1, +- 0x1.e35a11b9b61cep-1, +- 0x1.e4121370224ccp-1, +- 0x1.e4c6372cd8927p-1, +- 0x1.e5768c3b4a3fcp-1, +- 0x1.e62321d06c5e0p-1, +- 0x1.e6cc0709c8a0dp-1, +- 0x1.e7714aec96534p-1, +- 0x1.e812fc64db369p-1, +- 0x1.e8b12a44944a8p-1, +- 0x1.e94be342e6743p-1, +- 0x1.e9e335fb56f87p-1, +- 0x1.ea7730ed0bbb9p-1, +- 0x1.eb07e27a133aap-1, +- 0x1.eb9558e6b42cep-1, +- 0x1.ec1fa258c4beap-1, +- 0x1.eca6ccd709544p-1, +- 0x1.ed2ae6489ac1ep-1, +- 0x1.edabfc7453e63p-1, +- 0x1.ee2a1d004692cp-1, +- 0x1.eea5557137ae0p-1, +- 0x1.ef1db32a2277cp-1, +- 0x1.ef93436bc2daap-1, +- 0x1.f006135426b26p-1, +- 0x1.f0762fde45ee6p-1, +- 0x1.f0e3a5e1a1788p-1, +- 0x1.f14e8211e8c55p-1, +- 0x1.f1b6d0fea5f4dp-1, +- 0x1.f21c9f12f0677p-1, +- 0x1.f27ff89525acfp-1, +- 0x1.f2e0e9a6a8b09p-1, +- 0x1.f33f7e43a706bp-1, +- 0x1.f39bc242e43e6p-1, +- 0x1.f3f5c1558b19ep-1, +- 0x1.f44d870704911p-1, +- 0x1.f4a31ebcd47dfp-1, +- 0x1.f4f693b67bd77p-1, +- 0x1.f547f10d60597p-1, +- 0x1.f59741b4b97cfp-1, +- 
0x1.f5e4907982a07p-1, +- 0x1.f62fe80272419p-1, +- 0x1.f67952cff6282p-1, +- 0x1.f6c0db3c34641p-1, +- 0x1.f7068b7b10fd9p-1, +- 0x1.f74a6d9a38383p-1, +- 0x1.f78c8b812d498p-1, +- 0x1.f7cceef15d631p-1, +- 0x1.f80ba18636f07p-1, +- 0x1.f848acb544e95p-1, +- 0x1.f88419ce4e184p-1, +- 0x1.f8bdf1fb78370p-1, +- 0x1.f8f63e416ebffp-1, +- 0x1.f92d077f8d56dp-1, +- 0x1.f96256700da8ep-1, +- 0x1.f99633a838a57p-1, +- 0x1.f9c8a7989af0dp-1, +- 0x1.f9f9ba8d3c733p-1, +- 0x1.fa2974addae45p-1, +- 0x1.fa57ddfe27376p-1, +- 0x1.fa84fe5e05c8dp-1, +- 0x1.fab0dd89d1309p-1, +- 0x1.fadb831a9f9c3p-1, +- 0x1.fb04f6868a944p-1, +- 0x1.fb2d3f20f9101p-1, +- 0x1.fb54641aebbc9p-1, +- 0x1.fb7a6c834b5a2p-1, +- 0x1.fb9f5f4739170p-1, +- 0x1.fbc3433260ca5p-1, +- 0x1.fbe61eef4cf6ap-1, +- 0x1.fc07f907bc794p-1, +- 0x1.fc28d7e4f9cd0p-1, +- 0x1.fc48c1d033c7ap-1, +- 0x1.fc67bcf2d7b8fp-1, +- 0x1.fc85cf56ecd38p-1, +- 0x1.fca2fee770c79p-1, +- 0x1.fcbf5170b578bp-1, +- 0x1.fcdacca0bfb73p-1, +- 0x1.fcf57607a6e7cp-1, +- 0x1.fd0f5317f582fp-1, +- 0x1.fd2869270a56fp-1, +- 0x1.fd40bd6d7a785p-1, +- 0x1.fd58550773cb5p-1, +- 0x1.fd6f34f52013ap-1, +- 0x1.fd85621b0876dp-1, +- 0x1.fd9ae142795e3p-1, +- 0x1.fdafb719e6a69p-1, +- 0x1.fdc3e835500b3p-1, +- 0x1.fdd7790ea5bc0p-1, +- 0x1.fdea6e062d0c9p-1, +- 0x1.fdfccb62e52d3p-1, +- 0x1.fe0e9552ebdd6p-1, +- 0x1.fe1fcfebe2083p-1, +- 0x1.fe307f2b503d0p-1, +- 0x1.fe40a6f70af4bp-1, +- 0x1.fe504b1d9696cp-1, +- 0x1.fe5f6f568b301p-1, +- 0x1.fe6e1742f7cf6p-1, +- 0x1.fe7c466dc57a1p-1, +- 0x1.fe8a004c19ae6p-1, +- 0x1.fe97483db8670p-1, +- 0x1.fea4218d6594ap-1, +- 0x1.feb08f7146046p-1, +- 0x1.febc950b3fa75p-1, +- 0x1.fec835695932ep-1, +- 0x1.fed37386190fbp-1, +- 0x1.fede5248e38f4p-1, +- 0x1.fee8d486585eep-1, +- 0x1.fef2fd00af31ap-1, +- 0x1.fefcce6813974p-1, +- 0x1.ff064b5afffbep-1, +- 0x1.ff0f766697c76p-1, +- 0x1.ff18520700971p-1, +- 0x1.ff20e0a7ba8c2p-1, +- 0x1.ff2924a3f7a83p-1, +- 0x1.ff312046f2339p-1, +- 0x1.ff38d5cc4227fp-1, +- 0x1.ff404760319b4p-1, +- 0x1.ff47772010262p-1, +- 0x1.ff4e671a85425p-1, +- 0x1.ff55194fe19dfp-1, +- 0x1.ff5b8fb26f5f6p-1, +- 0x1.ff61cc26c1578p-1, +- 0x1.ff67d08401202p-1, +- 0x1.ff6d9e943c231p-1, +- 0x1.ff733814af88cp-1, +- 0x1.ff789eb6130c9p-1, +- 0x1.ff7dd41ce2b4dp-1, +- 0x1.ff82d9e1a76d8p-1, +- 0x1.ff87b1913e853p-1, +- 0x1.ff8c5cad200a5p-1, +- 0x1.ff90dcaba4096p-1, +- 0x1.ff9532f846ab0p-1, +- 0x1.ff9960f3eb327p-1, +- 0x1.ff9d67f51ddbap-1, +- 0x1.ffa14948549a7p-1, +- 0x1.ffa506302ebaep-1, +- 0x1.ffa89fe5b3625p-1, +- 0x1.ffac17988ef4bp-1, +- 0x1.ffaf6e6f4f5c0p-1, +- 0x1.ffb2a5879f35ep-1, +- 0x1.ffb5bdf67fe6fp-1, +- 0x1.ffb8b8c88295fp-1, +- 0x1.ffbb970200110p-1, +- 0x1.ffbe599f4f9d9p-1, +- 0x1.ffc10194fcb64p-1, +- 0x1.ffc38fcffbb7cp-1, +- 0x1.ffc60535dd7f5p-1, +- 0x1.ffc862a501fd7p-1, +- 0x1.ffcaa8f4c9beap-1, +- 0x1.ffccd8f5c66d1p-1, +- 0x1.ffcef371ea4d7p-1, +- 0x1.ffd0f92cb6ba7p-1, +- 0x1.ffd2eae369a07p-1, +- 0x1.ffd4c94d29fdbp-1, +- 0x1.ffd6951b33686p-1, +- 0x1.ffd84ef9009eep-1, +- 0x1.ffd9f78c7524ap-1, +- 0x1.ffdb8f7605ee7p-1, +- 0x1.ffdd1750e1220p-1, +- 0x1.ffde8fb314ebfp-1, +- 0x1.ffdff92db56e5p-1, +- 0x1.ffe1544d01ccbp-1, +- 0x1.ffe2a1988857cp-1, +- 0x1.ffe3e19349dc7p-1, +- 0x1.ffe514bbdc197p-1, +- 0x1.ffe63b8c8b5f7p-1, +- 0x1.ffe7567b7b5e1p-1, +- 0x1.ffe865fac722bp-1, +- 0x1.ffe96a78a04a9p-1, +- 0x1.ffea645f6d6dap-1, +- 0x1.ffeb5415e7c44p-1, +- 0x1.ffec39ff380b9p-1, +- 0x1.ffed167b12ac2p-1, +- 0x1.ffede9e5d3262p-1, +- 0x1.ffeeb49896c6dp-1, +- 0x1.ffef76e956a9fp-1, +- 0x1.fff0312b010b5p-1, +- 0x1.fff0e3ad91ec2p-1, +- 0x1.fff18ebe2b0e1p-1, +- 0x1.fff232a72b48ep-1, +- 0x1.fff2cfb0453d9p-1, +- 
0x1.fff3661e9569dp-1, +- 0x1.fff3f634b79f9p-1, +- 0x1.fff48032dbe40p-1, +- 0x1.fff50456dab8cp-1, +- 0x1.fff582dc48d30p-1, +- 0x1.fff5fbfc8a439p-1, +- 0x1.fff66feee5129p-1, +- 0x1.fff6dee89352ep-1, +- 0x1.fff7491cd4af6p-1, +- 0x1.fff7aebcff755p-1, +- 0x1.fff80ff8911fdp-1, +- 0x1.fff86cfd3e657p-1, +- 0x1.fff8c5f702ccfp-1, +- 0x1.fff91b102fca8p-1, +- 0x1.fff96c717b695p-1, +- 0x1.fff9ba420e834p-1, +- 0x1.fffa04a7928b1p-1, +- 0x1.fffa4bc63ee9ap-1, +- 0x1.fffa8fc0e5f33p-1, +- 0x1.fffad0b901755p-1, +- 0x1.fffb0ecebee1bp-1, +- 0x1.fffb4a210b172p-1, +- 0x1.fffb82cd9dcbfp-1, +- 0x1.fffbb8f1049c6p-1, +- 0x1.fffbeca6adbe9p-1, +- 0x1.fffc1e08f25f5p-1, +- 0x1.fffc4d3120aa1p-1, +- 0x1.fffc7a37857d2p-1, +- 0x1.fffca53375ce3p-1, +- 0x1.fffcce3b57bffp-1, +- 0x1.fffcf564ab6b7p-1, +- 0x1.fffd1ac4135f9p-1, +- 0x1.fffd3e6d5cd87p-1, +- 0x1.fffd607387b07p-1, +- 0x1.fffd80e8ce0dap-1, +- 0x1.fffd9fdeabccep-1, +- 0x1.fffdbd65e5ad0p-1, +- 0x1.fffdd98e903b2p-1, +- 0x1.fffdf46816833p-1, +- 0x1.fffe0e0140857p-1, +- 0x1.fffe26683972ap-1, +- 0x1.fffe3daa95b18p-1, +- 0x1.fffe53d558ae9p-1, +- 0x1.fffe68f4fa777p-1, +- 0x1.fffe7d156d244p-1, +- 0x1.fffe904222101p-1, +- 0x1.fffea2860ee1ep-1, +- 0x1.fffeb3ebb267bp-1, +- 0x1.fffec47d19457p-1, +- 0x1.fffed443e2787p-1, +- 0x1.fffee34943b15p-1, +- 0x1.fffef1960d85dp-1, +- 0x1.fffeff32af7afp-1, +- 0x1.ffff0c273bea2p-1, +- 0x1.ffff187b6bc0ep-1, +- 0x1.ffff2436a21dcp-1, +- 0x1.ffff2f5fefcaap-1, +- 0x1.ffff39fe16963p-1, +- 0x1.ffff44178c8d2p-1, +- 0x1.ffff4db27f146p-1, +- 0x1.ffff56d4d5e5ep-1, +- 0x1.ffff5f8435efcp-1, +- 0x1.ffff67c604180p-1, +- 0x1.ffff6f9f67e55p-1, +- 0x1.ffff77154e0d6p-1, +- 0x1.ffff7e2c6aea2p-1, +- 0x1.ffff84e93cd75p-1, +- 0x1.ffff8b500e77cp-1, +- 0x1.ffff9164f8e46p-1, +- 0x1.ffff972be5c59p-1, +- 0x1.ffff9ca891572p-1, +- 0x1.ffffa1de8c582p-1, +- 0x1.ffffa6d13de73p-1, +- 0x1.ffffab83e54b8p-1, +- 0x1.ffffaff99bac4p-1, +- 0x1.ffffb43555b5fp-1, +- 0x1.ffffb839e52f3p-1, +- 0x1.ffffbc09fa7cdp-1, +- 0x1.ffffbfa82616bp-1, +- 0x1.ffffc316d9ed0p-1, +- 0x1.ffffc6586abf6p-1, +- 0x1.ffffc96f1165ep-1, +- 0x1.ffffcc5cec0c1p-1, +- 0x1.ffffcf23ff5fcp-1, +- 0x1.ffffd1c637b2bp-1, +- 0x1.ffffd4456a10dp-1, +- 0x1.ffffd6a3554a1p-1, +- 0x1.ffffd8e1a2f22p-1, +- 0x1.ffffdb01e8546p-1, +- 0x1.ffffdd05a75eap-1, +- 0x1.ffffdeee4f810p-1, +- 0x1.ffffe0bd3e852p-1, +- 0x1.ffffe273c15b7p-1, +- 0x1.ffffe41314e06p-1, +- 0x1.ffffe59c6698bp-1, +- 0x1.ffffe710d565ep-1, +- 0x1.ffffe8717232dp-1, +- 0x1.ffffe9bf4098cp-1, +- 0x1.ffffeafb377d5p-1, +- 0x1.ffffec2641a9ep-1, +- 0x1.ffffed413e5b7p-1, +- 0x1.ffffee4d01cd6p-1, +- 0x1.ffffef4a55bd4p-1, +- 0x1.fffff039f9e8fp-1, +- 0x1.fffff11ca4876p-1, +- 0x1.fffff1f302bc1p-1, +- 0x1.fffff2bdb904dp-1, +- 0x1.fffff37d63a36p-1, +- 0x1.fffff43297019p-1, +- 0x1.fffff4dde0118p-1, +- 0x1.fffff57fc4a95p-1, +- 0x1.fffff618c3da6p-1, +- 0x1.fffff6a956450p-1, +- 0x1.fffff731ee681p-1, +- 0x1.fffff7b2f8ed6p-1, +- 0x1.fffff82cdcf1bp-1, +- 0x1.fffff89ffc4aap-1, +- 0x1.fffff90cb3c81p-1, +- 0x1.fffff9735b73bp-1, +- 0x1.fffff9d446cccp-1, +- 0x1.fffffa2fc5015p-1, +- 0x1.fffffa8621251p-1, +- 0x1.fffffad7a2652p-1, +- 0x1.fffffb248c39dp-1, +- 0x1.fffffb6d1e95dp-1, +- 0x1.fffffbb196132p-1, +- 0x1.fffffbf22c1e2p-1, +- 0x1.fffffc2f171e3p-1, +- 0x1.fffffc688a9cfp-1, +- 0x1.fffffc9eb76acp-1, +- 0x1.fffffcd1cbc28p-1, +- 0x1.fffffd01f36afp-1, +- 0x1.fffffd2f57d68p-1, +- 0x1.fffffd5a2041fp-1, +- 0x1.fffffd8271d12p-1, +- 0x1.fffffda86faa9p-1, +- 0x1.fffffdcc3b117p-1, +- 0x1.fffffdedf37edp-1, +- 0x1.fffffe0db6b91p-1, +- 0x1.fffffe2ba0ea5p-1, +- 0x1.fffffe47ccb60p-1, +- 0x1.fffffe62534d4p-1, +- 
0x1.fffffe7b4c81ep-1, +- 0x1.fffffe92ced93p-1, +- 0x1.fffffea8ef9cfp-1, +- 0x1.fffffebdc2ec6p-1, +- 0x1.fffffed15bcbap-1, +- 0x1.fffffee3cc32cp-1, +- 0x1.fffffef5251c2p-1, +- 0x1.ffffff0576917p-1, +- 0x1.ffffff14cfb92p-1, +- 0x1.ffffff233ee1dp-1, +- 0x1.ffffff30d18e8p-1, +- 0x1.ffffff3d9480fp-1, +- 0x1.ffffff4993c46p-1, +- 0x1.ffffff54dab72p-1, +- 0x1.ffffff5f74141p-1, +- 0x1.ffffff6969fb8p-1, +- 0x1.ffffff72c5fb6p-1, +- 0x1.ffffff7b91176p-1, +- 0x1.ffffff83d3d07p-1, +- 0x1.ffffff8b962bep-1, +- 0x1.ffffff92dfba2p-1, +- 0x1.ffffff99b79d2p-1, +- 0x1.ffffffa0248e8p-1, +- 0x1.ffffffa62ce54p-1, +- 0x1.ffffffabd69b4p-1, +- 0x1.ffffffb127525p-1, +- 0x1.ffffffb624592p-1, +- 0x1.ffffffbad2affp-1, +- 0x1.ffffffbf370cdp-1, +- 0x1.ffffffc355dfdp-1, +- 0x1.ffffffc733572p-1, +- 0x1.ffffffcad3626p-1, +- 0x1.ffffffce39b67p-1, +- 0x1.ffffffd169d0cp-1, +- 0x1.ffffffd466fa5p-1, +- 0x1.ffffffd7344aap-1, +- 0x1.ffffffd9d4aabp-1, +- 0x1.ffffffdc4ad7ap-1, +- 0x1.ffffffde9964ep-1, +- 0x1.ffffffe0c2bf0p-1, +- 0x1.ffffffe2c92dbp-1, +- 0x1.ffffffe4aed5ep-1, +- 0x1.ffffffe675bbdp-1, +- 0x1.ffffffe81fc4ep-1, +- 0x1.ffffffe9aeb97p-1, +- 0x1.ffffffeb24467p-1, +- 0x1.ffffffec81ff2p-1, +- 0x1.ffffffedc95e7p-1, +- 0x1.ffffffeefbc85p-1, +- 0x1.fffffff01a8b6p-1, +- 0x1.fffffff126e1ep-1, +- 0x1.fffffff221f30p-1, +- 0x1.fffffff30cd3fp-1, +- 0x1.fffffff3e8892p-1, +- 0x1.fffffff4b606fp-1, +- 0x1.fffffff57632dp-1, +- 0x1.fffffff629e44p-1, +- 0x1.fffffff6d1e56p-1, +- 0x1.fffffff76ef3fp-1, +- 0x1.fffffff801c1fp-1, +- 0x1.fffffff88af67p-1, +- 0x1.fffffff90b2e3p-1, +- 0x1.fffffff982fc1p-1, +- 0x1.fffffff9f2e9fp-1, +- 0x1.fffffffa5b790p-1, +- 0x1.fffffffabd229p-1, +- 0x1.fffffffb18582p-1, +- 0x1.fffffffb6d844p-1, +- 0x1.fffffffbbd0aap-1, +- 0x1.fffffffc0748fp-1, +- 0x1.fffffffc4c96cp-1, +- 0x1.fffffffc8d462p-1, +- 0x1.fffffffcc9a41p-1, +- 0x1.fffffffd01f89p-1, +- 0x1.fffffffd36871p-1, +- 0x1.fffffffd678edp-1, +- 0x1.fffffffd954aep-1, +- 0x1.fffffffdbff2ap-1, +- 0x1.fffffffde7ba0p-1, +- 0x1.fffffffe0cd16p-1, +- 0x1.fffffffe2f664p-1, +- 0x1.fffffffe4fa30p-1, +- 0x1.fffffffe6daf7p-1, +- 0x1.fffffffe89b0cp-1, +- 0x1.fffffffea3c9ap-1, +- 0x1.fffffffebc1a9p-1, +- 0x1.fffffffed2c21p-1, +- 0x1.fffffffee7dc8p-1, +- 0x1.fffffffefb847p-1, +- 0x1.ffffffff0dd2bp-1, +- 0x1.ffffffff1ede9p-1, +- 0x1.ffffffff2ebdap-1, +- 0x1.ffffffff3d843p-1, +- 0x1.ffffffff4b453p-1, +- 0x1.ffffffff58126p-1, +- 0x1.ffffffff63fc3p-1, +- 0x1.ffffffff6f121p-1, +- 0x1.ffffffff79626p-1, +- 0x1.ffffffff82fabp-1, +- 0x1.ffffffff8be77p-1, +- 0x1.ffffffff94346p-1, +- 0x1.ffffffff9bec8p-1, +- 0x1.ffffffffa319fp-1, +- 0x1.ffffffffa9c63p-1, +- 0x1.ffffffffaffa4p-1, +- 0x1.ffffffffb5be5p-1, +- 0x1.ffffffffbb1a2p-1, +- 0x1.ffffffffc014ep-1, +- 0x1.ffffffffc4b56p-1, +- 0x1.ffffffffc901cp-1, +- 0x1.ffffffffccfffp-1, +- 0x1.ffffffffd0b56p-1, +- 0x1.ffffffffd4271p-1, +- 0x1.ffffffffd759dp-1, +- 0x1.ffffffffda520p-1, +- 0x1.ffffffffdd13cp-1, +- 0x1.ffffffffdfa2dp-1, +- 0x1.ffffffffe202dp-1, +- 0x1.ffffffffe4371p-1, +- 0x1.ffffffffe642ap-1, +- 0x1.ffffffffe8286p-1, +- 0x1.ffffffffe9eb0p-1, +- 0x1.ffffffffeb8d0p-1, +- 0x1.ffffffffed10ap-1, +- 0x1.ffffffffee782p-1, +- 0x1.ffffffffefc57p-1, +- 0x1.fffffffff0fa7p-1, +- 0x1.fffffffff218fp-1, +- 0x1.fffffffff3227p-1, +- 0x1.fffffffff4188p-1, +- 0x1.fffffffff4fc9p-1, +- 0x1.fffffffff5cfdp-1, +- 0x1.fffffffff6939p-1, +- 0x1.fffffffff748ep-1, +- 0x1.fffffffff7f0dp-1, +- 0x1.fffffffff88c5p-1, +- 0x1.fffffffff91c6p-1, +- 0x1.fffffffff9a1bp-1, +- 0x1.fffffffffa1d2p-1, +- 0x1.fffffffffa8f6p-1, +- 0x1.fffffffffaf92p-1, +- 0x1.fffffffffb5b0p-1, +- 
0x1.fffffffffbb58p-1, +- 0x1.fffffffffc095p-1, +- 0x1.fffffffffc56dp-1, +- 0x1.fffffffffc9e8p-1, +- 0x1.fffffffffce0dp-1, +- 0x1.fffffffffd1e1p-1, +- 0x1.fffffffffd56cp-1, +- 0x1.fffffffffd8b3p-1, +- 0x1.fffffffffdbbap-1, +- 0x1.fffffffffde86p-1, +- 0x1.fffffffffe11dp-1, +- 0x1.fffffffffe380p-1, +- 0x1.fffffffffe5b6p-1, +- 0x1.fffffffffe7c0p-1, +- 0x1.fffffffffe9a2p-1, +- 0x1.fffffffffeb60p-1, +- 0x1.fffffffffecfbp-1, +- 0x1.fffffffffee77p-1, +- 0x1.fffffffffefd6p-1, +- 0x1.ffffffffff11ap-1, +- 0x1.ffffffffff245p-1, +- 0x1.ffffffffff359p-1, +- 0x1.ffffffffff457p-1, +- 0x1.ffffffffff542p-1, +- 0x1.ffffffffff61bp-1, +- 0x1.ffffffffff6e3p-1, +- 0x1.ffffffffff79bp-1, +- 0x1.ffffffffff845p-1, +- 0x1.ffffffffff8e2p-1, +- 0x1.ffffffffff973p-1, +- 0x1.ffffffffff9f8p-1, +- 0x1.ffffffffffa73p-1, +- 0x1.ffffffffffae4p-1, +- 0x1.ffffffffffb4cp-1, +- 0x1.ffffffffffbadp-1, +- 0x1.ffffffffffc05p-1, +- 0x1.ffffffffffc57p-1, +- 0x1.ffffffffffca2p-1, +- 0x1.ffffffffffce7p-1, +- 0x1.ffffffffffd27p-1, +- 0x1.ffffffffffd62p-1, +- 0x1.ffffffffffd98p-1, +- 0x1.ffffffffffdcap-1, +- 0x1.ffffffffffdf8p-1, +- 0x1.ffffffffffe22p-1, +- 0x1.ffffffffffe49p-1, +- 0x1.ffffffffffe6cp-1, +- 0x1.ffffffffffe8dp-1, +- 0x1.ffffffffffeabp-1, +- 0x1.ffffffffffec7p-1, +- 0x1.ffffffffffee1p-1, +- 0x1.ffffffffffef8p-1, +- 0x1.fffffffffff0ep-1, +- 0x1.fffffffffff22p-1, +- 0x1.fffffffffff34p-1, +- 0x1.fffffffffff45p-1, +- 0x1.fffffffffff54p-1, +- 0x1.fffffffffff62p-1, +- 0x1.fffffffffff6fp-1, +- 0x1.fffffffffff7bp-1, +- 0x1.fffffffffff86p-1, +- 0x1.fffffffffff90p-1, +- 0x1.fffffffffff9ap-1, +- 0x1.fffffffffffa2p-1, +- 0x1.fffffffffffaap-1, +- 0x1.fffffffffffb1p-1, +- 0x1.fffffffffffb8p-1, +- 0x1.fffffffffffbep-1, +- 0x1.fffffffffffc3p-1, +- 0x1.fffffffffffc8p-1, +- 0x1.fffffffffffcdp-1, +- 0x1.fffffffffffd1p-1, +- 0x1.fffffffffffd5p-1, +- 0x1.fffffffffffd9p-1, +- 0x1.fffffffffffdcp-1, +- 0x1.fffffffffffdfp-1, +- 0x1.fffffffffffe2p-1, +- 0x1.fffffffffffe4p-1, +- 0x1.fffffffffffe7p-1, +- 0x1.fffffffffffe9p-1, +- 0x1.fffffffffffebp-1, +- 0x1.fffffffffffedp-1, +- 0x1.fffffffffffeep-1, +- 0x1.ffffffffffff0p-1, +- 0x1.ffffffffffff1p-1, +- 0x1.ffffffffffff3p-1, +- 0x1.ffffffffffff4p-1, +- 0x1.ffffffffffff5p-1, +- 0x1.ffffffffffff6p-1, +- 0x1.ffffffffffff7p-1, +- 0x1.ffffffffffff7p-1, +- 0x1.ffffffffffff8p-1, +- 0x1.ffffffffffff9p-1, +- 0x1.ffffffffffff9p-1, +- 0x1.ffffffffffffap-1, +- 0x1.ffffffffffffbp-1, +- 0x1.ffffffffffffbp-1, +- 0x1.ffffffffffffbp-1, +- 0x1.ffffffffffffcp-1, +- 0x1.ffffffffffffcp-1, +- 0x1.ffffffffffffdp-1, +- 0x1.ffffffffffffdp-1, +- 0x1.ffffffffffffdp-1, +- 0x1.ffffffffffffdp-1, +- 0x1.ffffffffffffep-1, +- 0x1.ffffffffffffep-1, +- 0x1.ffffffffffffep-1, +- 0x1.ffffffffffffep-1, +- 0x1.ffffffffffffep-1, +- 0x1.ffffffffffffep-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.fffffffffffffp-1, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- 0x1.0000000000000p+0, +- }, +- .scale = { 0x1.20dd750429b6dp+0, +- 0x1.20d8f1975c85dp+0, +- 0x1.20cb67bd452c7p+0, +- 0x1.20b4d8bac36c1p+0, +- 0x1.209546ad13ccfp+0, +- 0x1.206cb4897b148p+0, +- 0x1.203b261cd0052p+0, +- 0x1.2000a00ae3804p+0, +- 0x1.1fbd27cdc72d3p+0, +- 
0x1.1f70c3b4f2cc7p+0, +- 0x1.1f1b7ae44867fp+0, +- 0x1.1ebd5552f795bp+0, +- 0x1.1e565bca400d4p+0, +- 0x1.1de697e413d28p+0, +- 0x1.1d6e14099944ap+0, +- 0x1.1cecdb718d61cp+0, +- 0x1.1c62fa1e869b6p+0, +- 0x1.1bd07cdd189acp+0, +- 0x1.1b357141d95d5p+0, +- 0x1.1a91e5a748165p+0, +- 0x1.19e5e92b964abp+0, +- 0x1.19318bae53a04p+0, +- 0x1.1874ddcdfce24p+0, +- 0x1.17aff0e56ec10p+0, +- 0x1.16e2d7093cd8cp+0, +- 0x1.160da304ed92fp+0, +- 0x1.153068581b781p+0, +- 0x1.144b3b337c90cp+0, +- 0x1.135e3075d076bp+0, +- 0x1.12695da8b5bdep+0, +- 0x1.116cd8fd67618p+0, +- 0x1.1068b94962e5ep+0, +- 0x1.0f5d1602f7e41p+0, +- 0x1.0e4a073dc1b91p+0, +- 0x1.0d2fa5a70c168p+0, +- 0x1.0c0e0a8223359p+0, +- 0x1.0ae54fa490722p+0, +- 0x1.09b58f724416bp+0, +- 0x1.087ee4d9ad247p+0, +- 0x1.07416b4fbfe7cp+0, +- 0x1.05fd3ecbec297p+0, +- 0x1.04b27bc403d30p+0, +- 0x1.03613f2812dafp+0, +- 0x1.0209a65e29545p+0, +- 0x1.00abcf3e187a9p+0, +- 0x1.fe8fb01a47307p-1, +- 0x1.fbbbbef34b4b2p-1, +- 0x1.f8dc092d58ff8p-1, +- 0x1.f5f0cdaf15313p-1, +- 0x1.f2fa4c16c0019p-1, +- 0x1.eff8c4b1375dbp-1, +- 0x1.ecec7870ebca7p-1, +- 0x1.e9d5a8e4c934ep-1, +- 0x1.e6b4982f158b9p-1, +- 0x1.e38988fc46e72p-1, +- 0x1.e054be79d3042p-1, +- 0x1.dd167c4cf9d2ap-1, +- 0x1.d9cf06898cdafp-1, +- 0x1.d67ea1a8b5368p-1, +- 0x1.d325927fb9d89p-1, +- 0x1.cfc41e36c7df9p-1, +- 0x1.cc5a8a3fbea40p-1, +- 0x1.c8e91c4d01368p-1, +- 0x1.c5701a484ef9dp-1, +- 0x1.c1efca49a5011p-1, +- 0x1.be68728e29d5dp-1, +- 0x1.bada596f25436p-1, +- 0x1.b745c55905bf8p-1, +- 0x1.b3aafcc27502ep-1, +- 0x1.b00a46237d5bep-1, +- 0x1.ac63e7ecc1411p-1, +- 0x1.a8b8287ec6a09p-1, +- 0x1.a5074e2157620p-1, +- 0x1.a1519efaf889ep-1, +- 0x1.9d97610879642p-1, +- 0x1.99d8da149c13fp-1, +- 0x1.96164fafd8de3p-1, +- 0x1.925007283d7aap-1, +- 0x1.8e86458169af8p-1, +- 0x1.8ab94f6caa71dp-1, +- 0x1.86e9694134b9ep-1, +- 0x1.8316d6f48133dp-1, +- 0x1.7f41dc12c9e89p-1, +- 0x1.7b6abbb7aaf19p-1, +- 0x1.7791b886e7403p-1, +- 0x1.73b714a552763p-1, +- 0x1.6fdb11b1e0c34p-1, +- 0x1.6bfdf0beddaf5p-1, +- 0x1.681ff24b4ab04p-1, +- 0x1.6441563c665d4p-1, +- 0x1.60625bd75d07bp-1, +- 0x1.5c8341bb23767p-1, +- 0x1.58a445da7c74cp-1, +- 0x1.54c5a57629db0p-1, +- 0x1.50e79d1749ac9p-1, +- 0x1.4d0a6889dfd9fp-1, +- 0x1.492e42d78d2c5p-1, +- 0x1.4553664273d24p-1, +- 0x1.417a0c4049fd0p-1, +- 0x1.3da26d759aef5p-1, +- 0x1.39ccc1b136d5ap-1, +- 0x1.35f93fe7d1b3dp-1, +- 0x1.32281e2fd1a92p-1, +- 0x1.2e5991bd4cbfcp-1, +- 0x1.2a8dcede3673bp-1, +- 0x1.26c508f6bd0ffp-1, +- 0x1.22ff727dd6f7bp-1, +- 0x1.1f3d3cf9ffe5ap-1, +- 0x1.1b7e98fe26217p-1, +- 0x1.17c3b626c7a11p-1, +- 0x1.140cc3173f007p-1, +- 0x1.1059ed7740313p-1, +- 0x1.0cab61f084b93p-1, +- 0x1.09014c2ca74dap-1, +- 0x1.055bd6d32e8d7p-1, +- 0x1.01bb2b87c6968p-1, +- 0x1.fc3ee5d1524b0p-2, +- 0x1.f511a91a67d2ap-2, +- 0x1.edeeee0959518p-2, +- 0x1.e6d6ffaa65a25p-2, +- 0x1.dfca26f5bbf88p-2, +- 0x1.d8c8aace11e63p-2, +- 0x1.d1d2cfff91594p-2, +- 0x1.cae8d93f1d7b6p-2, +- 0x1.c40b0729ed547p-2, +- 0x1.bd3998457afdap-2, +- 0x1.b674c8ffc6283p-2, +- 0x1.afbcd3afe8ab6p-2, +- 0x1.a911f096fbc26p-2, +- 0x1.a27455e14c93cp-2, +- 0x1.9be437a7de946p-2, +- 0x1.9561c7f23a47bp-2, +- 0x1.8eed36b886d93p-2, +- 0x1.8886b1e5ecfd1p-2, +- 0x1.822e655b417e6p-2, +- 0x1.7be47af1f5d89p-2, +- 0x1.75a91a7f4d2edp-2, +- 0x1.6f7c69d7d3ef8p-2, +- 0x1.695e8cd31867ep-2, +- 0x1.634fa54fa285fp-2, +- 0x1.5d4fd33729015p-2, +- 0x1.575f3483021c3p-2, +- 0x1.517de540ce2a3p-2, +- 0x1.4babff975a04cp-2, +- 0x1.45e99bcbb7915p-2, +- 0x1.4036d0468a7a2p-2, +- 0x1.3a93b1998736cp-2, +- 0x1.35005285227f1p-2, +- 0x1.2f7cc3fe6f423p-2, +- 0x1.2a09153529381p-2, +- 0x1.24a55399ea239p-2, +- 
0x1.1f518ae487dc8p-2, +- 0x1.1a0dc51a9934dp-2, +- 0x1.14da0a961fd14p-2, +- 0x1.0fb6620c550afp-2, +- 0x1.0aa2d09497f2bp-2, +- 0x1.059f59af7a906p-2, +- 0x1.00abff4dec7a3p-2, +- 0x1.f79183b101c5bp-3, +- 0x1.edeb406d9c824p-3, +- 0x1.e4652fadcb6b2p-3, +- 0x1.daff4969c0b04p-3, +- 0x1.d1b982c501370p-3, +- 0x1.c893ce1dcbef7p-3, +- 0x1.bf8e1b1ca2279p-3, +- 0x1.b6a856c3ed54fp-3, +- 0x1.ade26b7fbed95p-3, +- 0x1.a53c4135a6526p-3, +- 0x1.9cb5bd549b111p-3, +- 0x1.944ec2e4f5630p-3, +- 0x1.8c07329874652p-3, +- 0x1.83deeada4d25ap-3, +- 0x1.7bd5c7df3fe9cp-3, +- 0x1.73eba3b5b07b7p-3, +- 0x1.6c205655be71fp-3, +- 0x1.6473b5b15a7a1p-3, +- 0x1.5ce595c455b0ap-3, +- 0x1.5575c8a468361p-3, +- 0x1.4e241e912c305p-3, +- 0x1.46f066040a832p-3, +- 0x1.3fda6bc016994p-3, +- 0x1.38e1fae1d6a9dp-3, +- 0x1.3206dceef5f87p-3, +- 0x1.2b48d9e5dea1cp-3, +- 0x1.24a7b84d38971p-3, +- 0x1.1e233d434b813p-3, +- 0x1.17bb2c8d41535p-3, +- 0x1.116f48a6476ccp-3, +- 0x1.0b3f52ce8c383p-3, +- 0x1.052b0b1a174eap-3, +- 0x1.fe6460fef4680p-4, +- 0x1.f2a901ccafb37p-4, +- 0x1.e723726b824a9p-4, +- 0x1.dbd32ac4c99b0p-4, +- 0x1.d0b7a0f921e7cp-4, +- 0x1.c5d0497c09e74p-4, +- 0x1.bb1c972f23e50p-4, +- 0x1.b09bfb7d11a83p-4, +- 0x1.a64de673e8837p-4, +- 0x1.9c31c6df3b1b8p-4, +- 0x1.92470a61b6965p-4, +- 0x1.888d1d8e510a3p-4, +- 0x1.7f036c0107294p-4, +- 0x1.75a96077274bap-4, +- 0x1.6c7e64e7281cbp-4, +- 0x1.6381e2980956bp-4, +- 0x1.5ab342383d177p-4, +- 0x1.5211ebf41880bp-4, +- 0x1.499d478bca735p-4, +- 0x1.4154bc68d75c3p-4, +- 0x1.3937b1b319259p-4, +- 0x1.31458e6542847p-4, +- 0x1.297db960e4f63p-4, +- 0x1.21df9981f8e53p-4, +- 0x1.1a6a95b1e786fp-4, +- 0x1.131e14fa1625dp-4, +- 0x1.0bf97e95f2a64p-4, +- 0x1.04fc3a0481321p-4, +- 0x1.fc4b5e32d6259p-5, +- 0x1.eeea8c1b1db93p-5, +- 0x1.e1d4cf1e2450ap-5, +- 0x1.d508f9a1ea64ep-5, +- 0x1.c885df3451a07p-5, +- 0x1.bc4a54a84e834p-5, +- 0x1.b055303221015p-5, +- 0x1.a4a549829587ep-5, +- 0x1.993979e14fffdp-5, +- 0x1.8e109c4622913p-5, +- 0x1.83298d717210ep-5, +- 0x1.78832c03aa2b1p-5, +- 0x1.6e1c5893c380bp-5, +- 0x1.63f3f5c4de13bp-5, +- 0x1.5a08e85af27e0p-5, +- 0x1.505a174e9c929p-5, +- 0x1.46e66be002240p-5, +- 0x1.3dacd1a8d8ccdp-5, +- 0x1.34ac36ad8dafep-5, +- 0x1.2be38b6d92415p-5, +- 0x1.2351c2f2d1449p-5, +- 0x1.1af5d2e04f3f6p-5, +- 0x1.12ceb37ff9bc3p-5, +- 0x1.0adb5fcfa8c75p-5, +- 0x1.031ad58d56279p-5, +- 0x1.f7182a851bca2p-6, +- 0x1.e85c449e377f2p-6, +- 0x1.da0005e5f28dfp-6, +- 0x1.cc0180af00a8bp-6, +- 0x1.be5ecd2fcb5f9p-6, +- 0x1.b1160991ff737p-6, +- 0x1.a4255a00b9f03p-6, +- 0x1.978ae8b55ce1bp-6, +- 0x1.8b44e6031383ep-6, +- 0x1.7f5188610ddc8p-6, +- 0x1.73af0c737bb45p-6, +- 0x1.685bb5134ef13p-6, +- 0x1.5d55cb54cd53ap-6, +- 0x1.529b9e8cf9a1ep-6, +- 0x1.482b8455dc491p-6, +- 0x1.3e03d891b37dep-6, +- 0x1.3422fd6d12e2bp-6, +- 0x1.2a875b5ffab56p-6, +- 0x1.212f612dee7fbp-6, +- 0x1.181983e5133ddp-6, +- 0x1.0f443edc5ce49p-6, +- 0x1.06ae13b0d3255p-6, +- 0x1.fcab1483ea7fcp-7, +- 0x1.ec72615a894c4p-7, +- 0x1.dcaf3691fc448p-7, +- 0x1.cd5ec93c12431p-7, +- 0x1.be7e5ac24963bp-7, +- 0x1.b00b38d6b3575p-7, +- 0x1.a202bd6372dcep-7, +- 0x1.94624e78e0fafp-7, +- 0x1.87275e3a6869dp-7, +- 0x1.7a4f6aca256cbp-7, +- 0x1.6dd7fe3358230p-7, +- 0x1.61beae53b72b7p-7, +- 0x1.56011cc3b036dp-7, +- 0x1.4a9cf6bda3f4cp-7, +- 0x1.3f8ff5042a88ep-7, +- 0x1.34d7dbc76d7e5p-7, +- 0x1.2a727a89a3f14p-7, +- 0x1.205dac02bd6b9p-7, +- 0x1.1697560347b25p-7, +- 0x1.0d1d69569b82dp-7, +- 0x1.03ede1a45bfeep-7, +- 0x1.f60d8aa2a88f2p-8, +- 0x1.e4cc4abf7d065p-8, +- 0x1.d4143a9dfe965p-8, +- 0x1.c3e1a5f5c077cp-8, +- 0x1.b430ecf4a83a8p-8, +- 0x1.a4fe83fb9db25p-8, +- 0x1.9646f35a76623p-8, +- 
0x1.8806d70b2fc36p-8, +- 0x1.7a3ade6c8b3e4p-8, +- 0x1.6cdfcbfc1e263p-8, +- 0x1.5ff2750fe7820p-8, +- 0x1.536fc18f7ce5cp-8, +- 0x1.4754abacdf1dcp-8, +- 0x1.3b9e3f9d06e3fp-8, +- 0x1.30499b503957fp-8, +- 0x1.2553ee2a336bfp-8, +- 0x1.1aba78ba3af89p-8, +- 0x1.107a8c7323a6ep-8, +- 0x1.06918b6355624p-8, +- 0x1.f9f9cfd9c3035p-9, +- 0x1.e77448fb66bb9p-9, +- 0x1.d58da68fd1170p-9, +- 0x1.c4412bf4b8f0bp-9, +- 0x1.b38a3af2e55b4p-9, +- 0x1.a3645330550ffp-9, +- 0x1.93cb11a30d765p-9, +- 0x1.84ba3004a50d0p-9, +- 0x1.762d84469c18fp-9, +- 0x1.6821000795a03p-9, +- 0x1.5a90b00981d93p-9, +- 0x1.4d78bba8ca5fdp-9, +- 0x1.40d564548fad7p-9, +- 0x1.34a305080681fp-9, +- 0x1.28de11c5031ebp-9, +- 0x1.1d83170fbf6fbp-9, +- 0x1.128eb96be8798p-9, +- 0x1.07fdb4dafea5fp-9, +- 0x1.fb99b8b8279e1p-10, +- 0x1.e7f232d9e2630p-10, +- 0x1.d4fed7195d7e8p-10, +- 0x1.c2b9cf7f893bfp-10, +- 0x1.b11d702b3deb1p-10, +- 0x1.a024365f771bdp-10, +- 0x1.8fc8c794b03b5p-10, +- 0x1.8005f08d6f1efp-10, +- 0x1.70d6a46e07ddap-10, +- 0x1.6235fbd7a4345p-10, +- 0x1.541f340697987p-10, +- 0x1.468dadf4080abp-10, +- 0x1.397ced7af2b15p-10, +- 0x1.2ce898809244ep-10, +- 0x1.20cc76202c5fap-10, +- 0x1.15246dda49d47p-10, +- 0x1.09ec86c75d497p-10, +- 0x1.fe41cd9bb4eeep-11, +- 0x1.e97ba3b77f306p-11, +- 0x1.d57f524723822p-11, +- 0x1.c245d4b998479p-11, +- 0x1.afc85e0f82e12p-11, +- 0x1.9e005769dbc1dp-11, +- 0x1.8ce75e9f6f8a0p-11, +- 0x1.7c7744d9378f7p-11, +- 0x1.6caa0d3582fe9p-11, +- 0x1.5d79eb71e893bp-11, +- 0x1.4ee1429bf7cc0p-11, +- 0x1.40daa3c89f5b6p-11, +- 0x1.3360ccd23db3ap-11, +- 0x1.266ea71d4f71ap-11, +- 0x1.19ff4663ae9dfp-11, +- 0x1.0e0de78654d1ep-11, +- 0x1.0295ef6591848p-11, +- 0x1.ef25d37f49fe1p-12, +- 0x1.da01102b5f851p-12, +- 0x1.c5b5412dcafadp-12, +- 0x1.b23a5a23e4210p-12, +- 0x1.9f8893d8fd1c1p-12, +- 0x1.8d986a4187285p-12, +- 0x1.7c629a822bc9ep-12, +- 0x1.6be02102b3520p-12, +- 0x1.5c0a378c90bcap-12, +- 0x1.4cda5374ea275p-12, +- 0x1.3e4a23d1f4702p-12, +- 0x1.30538fbb77ecdp-12, +- 0x1.22f0b496539bdp-12, +- 0x1.161be46ad3b50p-12, +- 0x1.09cfa445b00ffp-12, +- 0x1.fc0d55470cf51p-13, +- 0x1.e577bbcd49935p-13, +- 0x1.cfd4a5adec5bfp-13, +- 0x1.bb1a9657ce465p-13, +- 0x1.a740684026555p-13, +- 0x1.943d4a1d1ed39p-13, +- 0x1.8208bc334a6a5p-13, +- 0x1.709a8db59f25cp-13, +- 0x1.5feada379d8b7p-13, +- 0x1.4ff207314a102p-13, +- 0x1.40a8c1949f75ep-13, +- 0x1.3207fb7420eb9p-13, +- 0x1.2408e9ba3327fp-13, +- 0x1.16a501f0e42cap-13, +- 0x1.09d5f819c9e29p-13, +- 0x1.fb2b792b40a22p-14, +- 0x1.e3bcf436a1a95p-14, +- 0x1.cd55277c18d05p-14, +- 0x1.b7e94604479dcp-14, +- 0x1.a36eec00926ddp-14, +- 0x1.8fdc1b2dcf7b9p-14, +- 0x1.7d2737527c3f9p-14, +- 0x1.6b4702d7d5849p-14, +- 0x1.5a329b7d30748p-14, +- 0x1.49e17724f4d41p-14, +- 0x1.3a4b60ba9aa4dp-14, +- 0x1.2b6875310f785p-14, +- 0x1.1d312098e9dbap-14, +- 0x1.0f9e1b4dd36dfp-14, +- 0x1.02a8673a94691p-14, +- 0x1.ec929a665b449p-15, +- 0x1.d4f4b4c8e09edp-15, +- 0x1.be6abbb10a5aap-15, +- 0x1.a8e8cc1fadef6p-15, +- 0x1.94637d5bacfdbp-15, +- 0x1.80cfdc72220cfp-15, +- 0x1.6e2367dc27f95p-15, +- 0x1.5c540b4936fd2p-15, +- 0x1.4b581b8d170fcp-15, +- 0x1.3b2652b06c2b2p-15, +- 0x1.2bb5cc22e5db6p-15, +- 0x1.1cfe010e2052dp-15, +- 0x1.0ef6c4c84a0fep-15, +- 0x1.01984165a5f36p-15, +- 0x1.e9b5e8d00ce76p-16, +- 0x1.d16f5716c6c1ap-16, +- 0x1.ba4f035d60e02p-16, +- 0x1.a447b7b03f045p-16, +- 0x1.8f4ccca7fc90dp-16, +- 0x1.7b5223dac7336p-16, +- 0x1.684c227fcacefp-16, +- 0x1.562fac4329b48p-16, +- 0x1.44f21e49054f2p-16, +- 0x1.34894a5e24657p-16, +- 0x1.24eb7254ccf83p-16, +- 0x1.160f438c70913p-16, +- 0x1.07ebd2a2d2844p-16, +- 0x1.f4f12e9ab070ap-17, +- 0x1.db5ad0b27805cp-17, 
+- 0x1.c304efa2c6f4ep-17, +- 0x1.abe09e9144b5ep-17, +- 0x1.95df988e76644p-17, +- 0x1.80f439b4ee04bp-17, +- 0x1.6d11788a69c64p-17, +- 0x1.5a2adfa0b4bc4p-17, +- 0x1.4834877429b8fp-17, +- 0x1.37231085c7d9ap-17, +- 0x1.26eb9daed6f7ep-17, +- 0x1.1783ceac28910p-17, +- 0x1.08e1badf0fcedp-17, +- 0x1.f5f7d88472604p-18, +- 0x1.db92b5212fb8dp-18, +- 0x1.c282cd3957edap-18, +- 0x1.aab7abace48dcp-18, +- 0x1.94219bfcb4928p-18, +- 0x1.7eb1a2075864dp-18, +- 0x1.6a597219a93d9p-18, +- 0x1.570b69502f313p-18, +- 0x1.44ba864670882p-18, +- 0x1.335a62115bce2p-18, +- 0x1.22df298214423p-18, +- 0x1.133d96ae7e0ddp-18, +- 0x1.046aeabcfcdecp-18, +- 0x1.ecb9cfe1d8642p-19, +- 0x1.d21397ead99cbp-19, +- 0x1.b8d094c86d374p-19, +- 0x1.a0df0f0c626dcp-19, +- 0x1.8a2e269750a39p-19, +- 0x1.74adc8f4064d3p-19, +- 0x1.604ea819f007cp-19, +- 0x1.4d0231928c6f9p-19, +- 0x1.3aba85fe22e1fp-19, +- 0x1.296a70f414053p-19, +- 0x1.1905613b3abf2p-19, +- 0x1.097f6156f32c5p-19, +- 0x1.f59a20caf6695p-20, +- 0x1.d9c73698fb1dcp-20, +- 0x1.bf716c6168baep-20, +- 0x1.a6852c6b58392p-20, +- 0x1.8eefd70594a88p-20, +- 0x1.789fb715aae95p-20, +- 0x1.6383f726a8e04p-20, +- 0x1.4f8c96f26a26ap-20, +- 0x1.3caa61607f920p-20, +- 0x1.2acee2f5ecdb8p-20, +- 0x1.19ec60b1242edp-20, +- 0x1.09f5cf4dd2877p-20, +- 0x1.f5bd95d8730d8p-21, +- 0x1.d9371e2ff7c35p-21, +- 0x1.be41de54d155ap-21, +- 0x1.a4c89e08ef4f3p-21, +- 0x1.8cb738399b12cp-21, +- 0x1.75fa8dbc84becp-21, +- 0x1.608078a70dcbcp-21, +- 0x1.4c37c0394d094p-21, +- 0x1.39100d5687bfep-21, +- 0x1.26f9df8519bd6p-21, +- 0x1.15e6827001f18p-21, +- 0x1.05c803e4831c1p-21, +- 0x1.ed22548cffd35p-22, +- 0x1.d06ad6ecdf971p-22, +- 0x1.b551c847fbc96p-22, +- 0x1.9bc09f112b494p-22, +- 0x1.83a1ff0aa239dp-22, +- 0x1.6ce1aa3fd7bddp-22, +- 0x1.576c72b514859p-22, +- 0x1.43302cc4a0da8p-22, +- 0x1.301ba221dc9bbp-22, +- 0x1.1e1e857adc568p-22, +- 0x1.0d2966b1746f7p-22, +- 0x1.fa5b4f49cc6b2p-23, +- 0x1.dc3ae30b55c16p-23, +- 0x1.bfd7555a3bd68p-23, +- 0x1.a517d9e61628ap-23, +- 0x1.8be4f8f6c951fp-23, +- 0x1.74287ded49339p-23, +- 0x1.5dcd669f2cd34p-23, +- 0x1.48bfd38302870p-23, +- 0x1.34ecf8a3c124ap-23, +- 0x1.22430f521cbcfp-23, +- 0x1.10b1488aeb235p-23, +- 0x1.0027c00a263a6p-23, +- 0x1.e12ee004efc37p-24, +- 0x1.c3e44ae32b16bp-24, +- 0x1.a854ea14102a8p-24, +- 0x1.8e6761569f45dp-24, +- 0x1.7603bac345f65p-24, +- 0x1.5f1353cdad001p-24, +- 0x1.4980cb3c80949p-24, +- 0x1.3537f00b6ad4dp-24, +- 0x1.2225b12bffc68p-24, +- 0x1.10380e1adb7e9p-24, +- 0x1.febc107d5efaap-25, +- 0x1.df0f2a0ee6946p-25, +- 0x1.c14b2188bcee4p-25, +- 0x1.a553644f7f07dp-25, +- 0x1.8b0cfce0579dfp-25, +- 0x1.725e7c5dd20f7p-25, +- 0x1.5b2fe547a1340p-25, +- 0x1.456a974e92e93p-25, +- 0x1.30f93c3699078p-25, +- 0x1.1dc7b5b978cf8p-25, +- 0x1.0bc30c5d52f15p-25, +- 0x1.f5b2be65a0c7fp-26, +- 0x1.d5f3a8dea7357p-26, +- 0x1.b82915b03515bp-26, +- 0x1.9c3517e789488p-26, +- 0x1.81fb7df06136ep-26, +- 0x1.6961b8d641d06p-26, +- 0x1.524ec4d916caep-26, +- 0x1.3cab1343d18d1p-26, +- 0x1.2860757487a01p-26, +- 0x1.155a09065d4f7p-26, +- 0x1.0384250e4c9fcp-26, +- 0x1.e59890b926c78p-27, +- 0x1.c642116a8a9e3p-27, +- 0x1.a8e405e651ab6p-27, +- 0x1.8d5f98114f872p-27, +- 0x1.7397c5a66e307p-27, +- 0x1.5b71456c5a4c4p-27, +- 0x1.44d26de513197p-27, +- 0x1.2fa31d6371537p-27, +- 0x1.1bcca373b7b43p-27, +- 0x1.0939ab853339fp-27, +- 0x1.efac5187b2863p-28, +- 0x1.cf1e86235d0e6p-28, +- 0x1.b0a68a2128babp-28, +- 0x1.9423165bc4444p-28, +- 0x1.7974e743dea3cp-28, +- 0x1.607e9eacd1050p-28, +- 0x1.4924a74dec728p-28, +- 0x1.334d19e0c2160p-28, +- 0x1.1edfa3c5f5ccap-28, +- 0x1.0bc56f1b54701p-28, +- 0x1.f3d2185e047d9p-29, +- 
0x1.d26cb87945e87p-29, +- 0x1.b334fac4b9f99p-29, +- 0x1.96076f7918d1cp-29, +- 0x1.7ac2d72fc2c63p-29, +- 0x1.614801550319ep-29, +- 0x1.4979ac8b28926p-29, +- 0x1.333c68e2d0548p-29, +- 0x1.1e767bce37dd7p-29, +- 0x1.0b0fc5b6d05a0p-29, +- 0x1.f1e3523b41d7dp-30, +- 0x1.d00de6608effep-30, +- 0x1.b0778b7b3301ap-30, +- 0x1.92fb04ec0f6cfp-30, +- 0x1.77756ec9f78fap-30, +- 0x1.5dc61922d5a06p-30, +- 0x1.45ce65699ff6dp-30, +- 0x1.2f71a5f159970p-30, +- 0x1.1a94ff571654fp-30, +- 0x1.071f4bbea09ecp-30, +- 0x1.e9f1ff8ddd774p-31, +- 0x1.c818223a202c7p-31, +- 0x1.a887bd2b4404dp-31, +- 0x1.8b1a336c5eb6bp-31, +- 0x1.6fab63324088ap-31, +- 0x1.56197e30205bap-31, +- 0x1.3e44e45301b92p-31, +- 0x1.281000bfe4c3fp-31, +- 0x1.135f28f2d50b4p-31, +- 0x1.00187dded5975p-31, +- 0x1.dc479de0ef001p-32, +- 0x1.bad4fdad3caa1p-32, +- 0x1.9baed3ed27ab8p-32, +- 0x1.7ead9ce4285bbp-32, +- 0x1.63ac6b4edc88ep-32, +- 0x1.4a88be2a6390cp-32, +- 0x1.332259185f1a0p-32, +- 0x1.1d5b1f3793044p-32, +- 0x1.0916f04b6e18bp-32, +- 0x1.ec77101de6926p-33, +- 0x1.c960bf23153e0p-33, +- 0x1.a8bd20fc65ef7p-33, +- 0x1.8a61745ec7d1dp-33, +- 0x1.6e25d0e756261p-33, +- 0x1.53e4f7d1666cbp-33, +- 0x1.3b7c27a7ddb0ep-33, +- 0x1.24caf2c32af14p-33, +- 0x1.0fb3186804d0fp-33, +- 0x1.f830c0bb41fd7p-34, +- 0x1.d3c0f1a91c846p-34, +- 0x1.b1e5acf351d87p-34, +- 0x1.92712d259ce66p-34, +- 0x1.7538c60a04476p-34, +- 0x1.5a14b04b47879p-34, +- 0x1.40dfd87456f4cp-34, +- 0x1.2977b1172b9d5p-34, +- 0x1.13bc07e891491p-34, +- 0x1.ff1dbb4300811p-35, +- 0x1.d9a880f306bd8p-35, +- 0x1.b6e45220b55e0p-35, +- 0x1.96a0b33f2c4dap-35, +- 0x1.78b07e9e924acp-35, +- 0x1.5ce9ab1670dd2p-35, +- 0x1.4325167006bb0p-35, +- 0x1.2b3e53538ff3fp-35, +- 0x1.15137a7f44864p-35, +- 0x1.0084ff125639dp-35, +- 0x1.daeb0b7311ec7p-36, +- 0x1.b7937d1c40c52p-36, +- 0x1.96d082f59ab06p-36, +- 0x1.7872d9fa10aadp-36, +- 0x1.5c4e8e37bc7d0p-36, +- 0x1.423ac0df49a40p-36, +- 0x1.2a117230ad284p-36, +- 0x1.13af4f04f9998p-36, +- 0x1.fde703724e560p-37, +- 0x1.d77f0c82e7641p-37, +- 0x1.b3ee02611d7ddp-37, +- 0x1.92ff33023d5bdp-37, +- 0x1.7481a9e69f53fp-37, +- 0x1.5847eda620959p-37, +- 0x1.3e27c1fcc74bdp-37, +- 0x1.25f9ee0b923dcp-37, +- 0x1.0f9a0686531ffp-37, +- 0x1.f5cc7718082afp-38, +- 0x1.cf7e53d6a2ca5p-38, +- 0x1.ac0f5f3229372p-38, +- 0x1.8b498644847eap-38, +- 0x1.6cfa9bcca59dcp-38, +- 0x1.50f411d4fd2cdp-38, +- 0x1.370ab8327af5ep-38, +- 0x1.1f167f88c6b6ep-38, +- 0x1.08f24085d4597p-38, +- 0x1.e8f70e181d619p-39, +- 0x1.c324c20e337dcp-39, +- 0x1.a03261574b54ep-39, +- 0x1.7fe903cdf5855p-39, +- 0x1.6215c58da3450p-39, +- 0x1.46897d4b69fc6p-39, +- 0x1.2d1877d731b7bp-39, +- 0x1.159a386b11517p-39, +- 0x1.ffd27ae9393cep-40, +- 0x1.d7c593130dd0bp-40, +- 0x1.b2cd607c79bcfp-40, +- 0x1.90ae4d3405651p-40, +- 0x1.71312dd1759e2p-40, +- 0x1.5422ef5d8949dp-40, +- 0x1.39544b0ecc957p-40, +- 0x1.20997f73e73ddp-40, +- 0x1.09ca0eaacd277p-40, +- 0x1.e9810295890ecp-41, +- 0x1.c2b45b5aa4a1dp-41, +- 0x1.9eee068fa7596p-41, +- 0x1.7df2b399c10a8p-41, +- 0x1.5f8b87a31bd85p-41, +- 0x1.4385c96e9a2d9p-41, +- 0x1.29b2933ef4cbcp-41, +- 0x1.11e68a6378f8ap-41, +- 0x1.f7f338086a86bp-42, +- 0x1.cf8d7d9ce040ap-42, +- 0x1.aa577251ae484p-42, +- 0x1.8811d739efb5ep-42, +- 0x1.68823e52970bep-42, +- 0x1.4b72ae68e8b4cp-42, +- 0x1.30b14dbe876bcp-42, +- 0x1.181012ef86610p-42, +- 0x1.01647ba798744p-42, +- 0x1.d90e917701675p-43, +- 0x1.b2a87e86d0c8ap-43, +- 0x1.8f53dcb377293p-43, +- 0x1.6ed2f2515e933p-43, +- 0x1.50ecc9ed47f19p-43, +- 0x1.356cd5ce7799ep-43, +- 0x1.1c229a587ab78p-43, +- 0x1.04e15ecc7f3f6p-43, +- 0x1.deffc7e6a6017p-44, +- 0x1.b7b040832f310p-44, +- 
0x1.938e021f36d76p-44, +- 0x1.7258610b3b233p-44, +- 0x1.53d3bfc82a909p-44, +- 0x1.37c92babdc2fdp-44, +- 0x1.1e06010120f6ap-44, +- 0x1.065b9616170d4p-44, +- 0x1.e13dd96b3753ap-45, +- 0x1.b950d32467392p-45, +- 0x1.94a72263259a5p-45, +- 0x1.72fd93e036cdcp-45, +- 0x1.54164576929abp-45, +- 0x1.37b83c521fe96p-45, +- 0x1.1daf033182e96p-45, +- 0x1.05ca50205d26ap-45, +- 0x1.dfbb6235639fap-46, +- 0x1.b7807e294781fp-46, +- 0x1.9298add70a734p-46, +- 0x1.70beaf9c7ffb6p-46, +- 0x1.51b2cd6709222p-46, +- 0x1.353a6cf7f7fffp-46, +- 0x1.1b1fa8cbe84a7p-46, +- 0x1.0330f0fd69921p-46, +- 0x1.da81670f96f9bp-47, +- 0x1.b24a16b4d09aap-47, +- 0x1.8d6eeb6efdbd6p-47, +- 0x1.6ba91ac734785p-47, +- 0x1.4cb7966770ab5p-47, +- 0x1.305e9721d0981p-47, +- 0x1.1667311fff70ap-47, +- 0x1.fd3de10d62855p-48, +- 0x1.d1aefbcd48d0cp-48, +- 0x1.a9cc93c25aca9p-48, +- 0x1.85487ee3ea735p-48, +- 0x1.63daf8b4b1e0cp-48, +- 0x1.45421e69a6ca1p-48, +- 0x1.294175802d99ap-48, +- 0x1.0fa17bf41068fp-48, +- 0x1.f05e82aae2bb9p-49, +- 0x1.c578101b29058p-49, +- 0x1.9e39dc5dd2f7cp-49, +- 0x1.7a553a728bbf2p-49, +- 0x1.5982008db1304p-49, +- 0x1.3b7e00422e51bp-49, +- 0x1.200c898d9ee3ep-49, +- 0x1.06f5f7eb65a56p-49, +- 0x1.e00e9148a1d25p-50, +- 0x1.b623734024e92p-50, +- 0x1.8fd4e01891bf8p-50, +- 0x1.6cd44c7470d89p-50, +- 0x1.4cd9c04158cd7p-50, +- 0x1.2fa34bf5c8344p-50, +- 0x1.14f4890ff2461p-50, +- 0x1.f92c49dfa4df5p-51, +- 0x1.ccaaea71ab0dfp-51, +- 0x1.a40829f001197p-51, +- 0x1.7eef13b59e96cp-51, +- 0x1.5d11e1a252bf5p-51, +- 0x1.3e296303b2297p-51, +- 0x1.21f47009f43cep-51, +- 0x1.083768c5e4541p-51, +- 0x1.e1777d831265ep-52, +- 0x1.b69f10b0191b5p-52, +- 0x1.8f8a3a05b5b52p-52, +- 0x1.6be573c40c8e7p-52, +- 0x1.4b645ba991fdbp-52, +- 0x1.2dc119095729fp-52, +- }, +-}; +diff --git a/sysdeps/aarch64/fpu/sv_erff_data.c b/sysdeps/aarch64/fpu/sv_erff_data.c +deleted file mode 100644 +index 6dcd72af69..0000000000 +--- a/sysdeps/aarch64/fpu/sv_erff_data.c ++++ /dev/null +@@ -1,1058 +0,0 @@ +-/* Table for SVE erff approximation +- +- Copyright (C) 2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include "vecmath_config.h" +- +-/* Lookup table used in SVE erff. +- For each possible rounded input r (multiples of 1/128), between +- r = 0.0 and r = 4.0 (513 values): +- - __erff_data.erf contains the values of erf(r), +- - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2). +- Note that indices 0 and 1 are never hit by the algorithm, since lookup is +- performed only for x >= 1/64-1/512. 
*/ +-const struct sv_erff_data __sv_erff_data = { +- .erf = { 0x0.000000p+0, +- 0x1.20dbf4p-7, +- 0x1.20d770p-6, +- 0x1.b137e0p-6, +- 0x1.20c564p-5, +- 0x1.68e5d4p-5, +- 0x1.b0fafep-5, +- 0x1.f902a8p-5, +- 0x1.207d48p-4, +- 0x1.44703ep-4, +- 0x1.68591ap-4, +- 0x1.8c36bep-4, +- 0x1.b00812p-4, +- 0x1.d3cbf8p-4, +- 0x1.f7815ap-4, +- 0x1.0d9390p-3, +- 0x1.1f5e1ap-3, +- 0x1.311fc2p-3, +- 0x1.42d7fcp-3, +- 0x1.548642p-3, +- 0x1.662a0cp-3, +- 0x1.77c2d2p-3, +- 0x1.895010p-3, +- 0x1.9ad142p-3, +- 0x1.ac45e4p-3, +- 0x1.bdad72p-3, +- 0x1.cf076ep-3, +- 0x1.e05354p-3, +- 0x1.f190aap-3, +- 0x1.015f78p-2, +- 0x1.09eed6p-2, +- 0x1.127632p-2, +- 0x1.1af54ep-2, +- 0x1.236bf0p-2, +- 0x1.2bd9dcp-2, +- 0x1.343ed6p-2, +- 0x1.3c9aa8p-2, +- 0x1.44ed18p-2, +- 0x1.4d35f0p-2, +- 0x1.5574f4p-2, +- 0x1.5da9f4p-2, +- 0x1.65d4b8p-2, +- 0x1.6df50ap-2, +- 0x1.760abap-2, +- 0x1.7e1594p-2, +- 0x1.861566p-2, +- 0x1.8e0a02p-2, +- 0x1.95f336p-2, +- 0x1.9dd0d2p-2, +- 0x1.a5a2acp-2, +- 0x1.ad6896p-2, +- 0x1.b52264p-2, +- 0x1.bccfecp-2, +- 0x1.c47104p-2, +- 0x1.cc0584p-2, +- 0x1.d38d44p-2, +- 0x1.db081cp-2, +- 0x1.e275eap-2, +- 0x1.e9d68ap-2, +- 0x1.f129d4p-2, +- 0x1.f86faap-2, +- 0x1.ffa7eap-2, +- 0x1.03693ap-1, +- 0x1.06f794p-1, +- 0x1.0a7ef6p-1, +- 0x1.0dff50p-1, +- 0x1.117894p-1, +- 0x1.14eab4p-1, +- 0x1.1855a6p-1, +- 0x1.1bb95cp-1, +- 0x1.1f15ccp-1, +- 0x1.226ae8p-1, +- 0x1.25b8a8p-1, +- 0x1.28ff02p-1, +- 0x1.2c3decp-1, +- 0x1.2f755cp-1, +- 0x1.32a54cp-1, +- 0x1.35cdb4p-1, +- 0x1.38ee8ap-1, +- 0x1.3c07cap-1, +- 0x1.3f196ep-1, +- 0x1.42236ep-1, +- 0x1.4525c8p-1, +- 0x1.482074p-1, +- 0x1.4b1372p-1, +- 0x1.4dfebap-1, +- 0x1.50e24cp-1, +- 0x1.53be26p-1, +- 0x1.569244p-1, +- 0x1.595ea6p-1, +- 0x1.5c2348p-1, +- 0x1.5ee02ep-1, +- 0x1.619556p-1, +- 0x1.6442c0p-1, +- 0x1.66e86ep-1, +- 0x1.69865ep-1, +- 0x1.6c1c98p-1, +- 0x1.6eab18p-1, +- 0x1.7131e6p-1, +- 0x1.73b102p-1, +- 0x1.762870p-1, +- 0x1.789836p-1, +- 0x1.7b0058p-1, +- 0x1.7d60d8p-1, +- 0x1.7fb9c0p-1, +- 0x1.820b12p-1, +- 0x1.8454d6p-1, +- 0x1.869712p-1, +- 0x1.88d1cep-1, +- 0x1.8b050ep-1, +- 0x1.8d30dep-1, +- 0x1.8f5544p-1, +- 0x1.91724ap-1, +- 0x1.9387f6p-1, +- 0x1.959652p-1, +- 0x1.979d68p-1, +- 0x1.999d42p-1, +- 0x1.9b95e8p-1, +- 0x1.9d8768p-1, +- 0x1.9f71cap-1, +- 0x1.a1551ap-1, +- 0x1.a33162p-1, +- 0x1.a506b0p-1, +- 0x1.a6d50cp-1, +- 0x1.a89c86p-1, +- 0x1.aa5d26p-1, +- 0x1.ac16fcp-1, +- 0x1.adca14p-1, +- 0x1.af767ap-1, +- 0x1.b11c3cp-1, +- 0x1.b2bb68p-1, +- 0x1.b4540ap-1, +- 0x1.b5e630p-1, +- 0x1.b771e8p-1, +- 0x1.b8f742p-1, +- 0x1.ba764ap-1, +- 0x1.bbef10p-1, +- 0x1.bd61a2p-1, +- 0x1.bece0ep-1, +- 0x1.c03464p-1, +- 0x1.c194b2p-1, +- 0x1.c2ef08p-1, +- 0x1.c44376p-1, +- 0x1.c5920ap-1, +- 0x1.c6dad2p-1, +- 0x1.c81de2p-1, +- 0x1.c95b46p-1, +- 0x1.ca930ep-1, +- 0x1.cbc54cp-1, +- 0x1.ccf20cp-1, +- 0x1.ce1962p-1, +- 0x1.cf3b5cp-1, +- 0x1.d0580cp-1, +- 0x1.d16f7ep-1, +- 0x1.d281c4p-1, +- 0x1.d38ef0p-1, +- 0x1.d49710p-1, +- 0x1.d59a34p-1, +- 0x1.d6986cp-1, +- 0x1.d791cap-1, +- 0x1.d8865ep-1, +- 0x1.d97636p-1, +- 0x1.da6162p-1, +- 0x1.db47f4p-1, +- 0x1.dc29fcp-1, +- 0x1.dd0788p-1, +- 0x1.dde0aap-1, +- 0x1.deb570p-1, +- 0x1.df85eap-1, +- 0x1.e0522ap-1, +- 0x1.e11a3ep-1, +- 0x1.e1de36p-1, +- 0x1.e29e22p-1, +- 0x1.e35a12p-1, +- 0x1.e41214p-1, +- 0x1.e4c638p-1, +- 0x1.e5768cp-1, +- 0x1.e62322p-1, +- 0x1.e6cc08p-1, +- 0x1.e7714ap-1, +- 0x1.e812fcp-1, +- 0x1.e8b12ap-1, +- 0x1.e94be4p-1, +- 0x1.e9e336p-1, +- 0x1.ea7730p-1, +- 0x1.eb07e2p-1, +- 0x1.eb9558p-1, +- 0x1.ec1fa2p-1, +- 0x1.eca6ccp-1, +- 0x1.ed2ae6p-1, +- 0x1.edabfcp-1, +- 0x1.ee2a1ep-1, +- 0x1.eea556p-1, +- 0x1.ef1db4p-1, +- 
0x1.ef9344p-1, +- 0x1.f00614p-1, +- 0x1.f07630p-1, +- 0x1.f0e3a6p-1, +- 0x1.f14e82p-1, +- 0x1.f1b6d0p-1, +- 0x1.f21ca0p-1, +- 0x1.f27ff8p-1, +- 0x1.f2e0eap-1, +- 0x1.f33f7ep-1, +- 0x1.f39bc2p-1, +- 0x1.f3f5c2p-1, +- 0x1.f44d88p-1, +- 0x1.f4a31ep-1, +- 0x1.f4f694p-1, +- 0x1.f547f2p-1, +- 0x1.f59742p-1, +- 0x1.f5e490p-1, +- 0x1.f62fe8p-1, +- 0x1.f67952p-1, +- 0x1.f6c0dcp-1, +- 0x1.f7068cp-1, +- 0x1.f74a6ep-1, +- 0x1.f78c8cp-1, +- 0x1.f7cceep-1, +- 0x1.f80ba2p-1, +- 0x1.f848acp-1, +- 0x1.f8841ap-1, +- 0x1.f8bdf2p-1, +- 0x1.f8f63ep-1, +- 0x1.f92d08p-1, +- 0x1.f96256p-1, +- 0x1.f99634p-1, +- 0x1.f9c8a8p-1, +- 0x1.f9f9bap-1, +- 0x1.fa2974p-1, +- 0x1.fa57dep-1, +- 0x1.fa84fep-1, +- 0x1.fab0dep-1, +- 0x1.fadb84p-1, +- 0x1.fb04f6p-1, +- 0x1.fb2d40p-1, +- 0x1.fb5464p-1, +- 0x1.fb7a6cp-1, +- 0x1.fb9f60p-1, +- 0x1.fbc344p-1, +- 0x1.fbe61ep-1, +- 0x1.fc07fap-1, +- 0x1.fc28d8p-1, +- 0x1.fc48c2p-1, +- 0x1.fc67bcp-1, +- 0x1.fc85d0p-1, +- 0x1.fca2fep-1, +- 0x1.fcbf52p-1, +- 0x1.fcdaccp-1, +- 0x1.fcf576p-1, +- 0x1.fd0f54p-1, +- 0x1.fd286ap-1, +- 0x1.fd40bep-1, +- 0x1.fd5856p-1, +- 0x1.fd6f34p-1, +- 0x1.fd8562p-1, +- 0x1.fd9ae2p-1, +- 0x1.fdafb8p-1, +- 0x1.fdc3e8p-1, +- 0x1.fdd77ap-1, +- 0x1.fdea6ep-1, +- 0x1.fdfcccp-1, +- 0x1.fe0e96p-1, +- 0x1.fe1fd0p-1, +- 0x1.fe3080p-1, +- 0x1.fe40a6p-1, +- 0x1.fe504cp-1, +- 0x1.fe5f70p-1, +- 0x1.fe6e18p-1, +- 0x1.fe7c46p-1, +- 0x1.fe8a00p-1, +- 0x1.fe9748p-1, +- 0x1.fea422p-1, +- 0x1.feb090p-1, +- 0x1.febc96p-1, +- 0x1.fec836p-1, +- 0x1.fed374p-1, +- 0x1.fede52p-1, +- 0x1.fee8d4p-1, +- 0x1.fef2fep-1, +- 0x1.fefccep-1, +- 0x1.ff064cp-1, +- 0x1.ff0f76p-1, +- 0x1.ff1852p-1, +- 0x1.ff20e0p-1, +- 0x1.ff2924p-1, +- 0x1.ff3120p-1, +- 0x1.ff38d6p-1, +- 0x1.ff4048p-1, +- 0x1.ff4778p-1, +- 0x1.ff4e68p-1, +- 0x1.ff551ap-1, +- 0x1.ff5b90p-1, +- 0x1.ff61ccp-1, +- 0x1.ff67d0p-1, +- 0x1.ff6d9ep-1, +- 0x1.ff7338p-1, +- 0x1.ff789ep-1, +- 0x1.ff7dd4p-1, +- 0x1.ff82dap-1, +- 0x1.ff87b2p-1, +- 0x1.ff8c5cp-1, +- 0x1.ff90dcp-1, +- 0x1.ff9532p-1, +- 0x1.ff9960p-1, +- 0x1.ff9d68p-1, +- 0x1.ffa14ap-1, +- 0x1.ffa506p-1, +- 0x1.ffa8a0p-1, +- 0x1.ffac18p-1, +- 0x1.ffaf6ep-1, +- 0x1.ffb2a6p-1, +- 0x1.ffb5bep-1, +- 0x1.ffb8b8p-1, +- 0x1.ffbb98p-1, +- 0x1.ffbe5ap-1, +- 0x1.ffc102p-1, +- 0x1.ffc390p-1, +- 0x1.ffc606p-1, +- 0x1.ffc862p-1, +- 0x1.ffcaa8p-1, +- 0x1.ffccd8p-1, +- 0x1.ffcef4p-1, +- 0x1.ffd0fap-1, +- 0x1.ffd2eap-1, +- 0x1.ffd4cap-1, +- 0x1.ffd696p-1, +- 0x1.ffd84ep-1, +- 0x1.ffd9f8p-1, +- 0x1.ffdb90p-1, +- 0x1.ffdd18p-1, +- 0x1.ffde90p-1, +- 0x1.ffdffap-1, +- 0x1.ffe154p-1, +- 0x1.ffe2a2p-1, +- 0x1.ffe3e2p-1, +- 0x1.ffe514p-1, +- 0x1.ffe63cp-1, +- 0x1.ffe756p-1, +- 0x1.ffe866p-1, +- 0x1.ffe96ap-1, +- 0x1.ffea64p-1, +- 0x1.ffeb54p-1, +- 0x1.ffec3ap-1, +- 0x1.ffed16p-1, +- 0x1.ffedeap-1, +- 0x1.ffeeb4p-1, +- 0x1.ffef76p-1, +- 0x1.fff032p-1, +- 0x1.fff0e4p-1, +- 0x1.fff18ep-1, +- 0x1.fff232p-1, +- 0x1.fff2d0p-1, +- 0x1.fff366p-1, +- 0x1.fff3f6p-1, +- 0x1.fff480p-1, +- 0x1.fff504p-1, +- 0x1.fff582p-1, +- 0x1.fff5fcp-1, +- 0x1.fff670p-1, +- 0x1.fff6dep-1, +- 0x1.fff74ap-1, +- 0x1.fff7aep-1, +- 0x1.fff810p-1, +- 0x1.fff86cp-1, +- 0x1.fff8c6p-1, +- 0x1.fff91cp-1, +- 0x1.fff96cp-1, +- 0x1.fff9bap-1, +- 0x1.fffa04p-1, +- 0x1.fffa4cp-1, +- 0x1.fffa90p-1, +- 0x1.fffad0p-1, +- 0x1.fffb0ep-1, +- 0x1.fffb4ap-1, +- 0x1.fffb82p-1, +- 0x1.fffbb8p-1, +- 0x1.fffbecp-1, +- 0x1.fffc1ep-1, +- 0x1.fffc4ep-1, +- 0x1.fffc7ap-1, +- 0x1.fffca6p-1, +- 0x1.fffccep-1, +- 0x1.fffcf6p-1, +- 0x1.fffd1ap-1, +- 0x1.fffd3ep-1, +- 0x1.fffd60p-1, +- 0x1.fffd80p-1, +- 0x1.fffda0p-1, +- 0x1.fffdbep-1, +- 0x1.fffddap-1, +- 
0x1.fffdf4p-1, +- 0x1.fffe0ep-1, +- 0x1.fffe26p-1, +- 0x1.fffe3ep-1, +- 0x1.fffe54p-1, +- 0x1.fffe68p-1, +- 0x1.fffe7ep-1, +- 0x1.fffe90p-1, +- 0x1.fffea2p-1, +- 0x1.fffeb4p-1, +- 0x1.fffec4p-1, +- 0x1.fffed4p-1, +- 0x1.fffee4p-1, +- 0x1.fffef2p-1, +- 0x1.ffff00p-1, +- 0x1.ffff0cp-1, +- 0x1.ffff18p-1, +- 0x1.ffff24p-1, +- 0x1.ffff30p-1, +- 0x1.ffff3ap-1, +- 0x1.ffff44p-1, +- 0x1.ffff4ep-1, +- 0x1.ffff56p-1, +- 0x1.ffff60p-1, +- 0x1.ffff68p-1, +- 0x1.ffff70p-1, +- 0x1.ffff78p-1, +- 0x1.ffff7ep-1, +- 0x1.ffff84p-1, +- 0x1.ffff8cp-1, +- 0x1.ffff92p-1, +- 0x1.ffff98p-1, +- 0x1.ffff9cp-1, +- 0x1.ffffa2p-1, +- 0x1.ffffa6p-1, +- 0x1.ffffacp-1, +- 0x1.ffffb0p-1, +- 0x1.ffffb4p-1, +- 0x1.ffffb8p-1, +- 0x1.ffffbcp-1, +- 0x1.ffffc0p-1, +- 0x1.ffffc4p-1, +- 0x1.ffffc6p-1, +- 0x1.ffffcap-1, +- 0x1.ffffccp-1, +- 0x1.ffffd0p-1, +- 0x1.ffffd2p-1, +- 0x1.ffffd4p-1, +- 0x1.ffffd6p-1, +- 0x1.ffffd8p-1, +- 0x1.ffffdcp-1, +- 0x1.ffffdep-1, +- 0x1.ffffdep-1, +- 0x1.ffffe0p-1, +- 0x1.ffffe2p-1, +- 0x1.ffffe4p-1, +- 0x1.ffffe6p-1, +- 0x1.ffffe8p-1, +- 0x1.ffffe8p-1, +- 0x1.ffffeap-1, +- 0x1.ffffeap-1, +- 0x1.ffffecp-1, +- 0x1.ffffeep-1, +- 0x1.ffffeep-1, +- 0x1.fffff0p-1, +- 0x1.fffff0p-1, +- 0x1.fffff2p-1, +- 0x1.fffff2p-1, +- 0x1.fffff2p-1, +- 0x1.fffff4p-1, +- 0x1.fffff4p-1, +- 0x1.fffff4p-1, +- 0x1.fffff6p-1, +- 0x1.fffff6p-1, +- 0x1.fffff6p-1, +- 0x1.fffff8p-1, +- 0x1.fffff8p-1, +- 0x1.fffff8p-1, +- 0x1.fffff8p-1, +- 0x1.fffffap-1, +- 0x1.fffffap-1, +- 0x1.fffffap-1, +- 0x1.fffffap-1, +- 0x1.fffffap-1, +- 0x1.fffffap-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffcp-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.fffffep-1, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- 0x1.000000p+0, +- }, +- .scale = { 0x1.20dd76p+0, +- 0x1.20d8f2p+0, +- 0x1.20cb68p+0, +- 0x1.20b4d8p+0, +- 0x1.209546p+0, +- 0x1.206cb4p+0, +- 0x1.203b26p+0, +- 0x1.2000a0p+0, +- 0x1.1fbd28p+0, +- 0x1.1f70c4p+0, +- 0x1.1f1b7ap+0, +- 0x1.1ebd56p+0, +- 0x1.1e565cp+0, +- 0x1.1de698p+0, +- 0x1.1d6e14p+0, +- 0x1.1cecdcp+0, +- 0x1.1c62fap+0, +- 0x1.1bd07cp+0, +- 0x1.1b3572p+0, +- 0x1.1a91e6p+0, +- 0x1.19e5eap+0, +- 0x1.19318cp+0, +- 0x1.1874dep+0, +- 0x1.17aff0p+0, +- 0x1.16e2d8p+0, +- 0x1.160da4p+0, +- 0x1.153068p+0, +- 0x1.144b3cp+0, +- 0x1.135e30p+0, +- 0x1.12695ep+0, +- 0x1.116cd8p+0, +- 0x1.1068bap+0, +- 0x1.0f5d16p+0, +- 0x1.0e4a08p+0, +- 0x1.0d2fa6p+0, +- 0x1.0c0e0ap+0, +- 0x1.0ae550p+0, +- 0x1.09b590p+0, +- 0x1.087ee4p+0, +- 0x1.07416cp+0, +- 0x1.05fd3ep+0, +- 0x1.04b27cp+0, +- 0x1.036140p+0, +- 0x1.0209a6p+0, +- 0x1.00abd0p+0, +- 0x1.fe8fb0p-1, +- 0x1.fbbbbep-1, +- 0x1.f8dc0ap-1, +- 0x1.f5f0cep-1, +- 0x1.f2fa4cp-1, +- 0x1.eff8c4p-1, +- 0x1.ecec78p-1, +- 0x1.e9d5a8p-1, +- 0x1.e6b498p-1, +- 0x1.e38988p-1, +- 0x1.e054bep-1, +- 0x1.dd167cp-1, +- 0x1.d9cf06p-1, +- 0x1.d67ea2p-1, +- 0x1.d32592p-1, +- 0x1.cfc41ep-1, +- 0x1.cc5a8ap-1, +- 0x1.c8e91cp-1, +- 0x1.c5701ap-1, +- 0x1.c1efcap-1, +- 0x1.be6872p-1, +- 0x1.bada5ap-1, +- 0x1.b745c6p-1, +- 0x1.b3aafcp-1, +- 0x1.b00a46p-1, +- 0x1.ac63e8p-1, +- 0x1.a8b828p-1, +- 0x1.a5074ep-1, +- 0x1.a1519ep-1, +- 
0x1.9d9762p-1, +- 0x1.99d8dap-1, +- 0x1.961650p-1, +- 0x1.925008p-1, +- 0x1.8e8646p-1, +- 0x1.8ab950p-1, +- 0x1.86e96ap-1, +- 0x1.8316d6p-1, +- 0x1.7f41dcp-1, +- 0x1.7b6abcp-1, +- 0x1.7791b8p-1, +- 0x1.73b714p-1, +- 0x1.6fdb12p-1, +- 0x1.6bfdf0p-1, +- 0x1.681ff2p-1, +- 0x1.644156p-1, +- 0x1.60625cp-1, +- 0x1.5c8342p-1, +- 0x1.58a446p-1, +- 0x1.54c5a6p-1, +- 0x1.50e79ep-1, +- 0x1.4d0a68p-1, +- 0x1.492e42p-1, +- 0x1.455366p-1, +- 0x1.417a0cp-1, +- 0x1.3da26ep-1, +- 0x1.39ccc2p-1, +- 0x1.35f940p-1, +- 0x1.32281ep-1, +- 0x1.2e5992p-1, +- 0x1.2a8dcep-1, +- 0x1.26c508p-1, +- 0x1.22ff72p-1, +- 0x1.1f3d3cp-1, +- 0x1.1b7e98p-1, +- 0x1.17c3b6p-1, +- 0x1.140cc4p-1, +- 0x1.1059eep-1, +- 0x1.0cab62p-1, +- 0x1.09014cp-1, +- 0x1.055bd6p-1, +- 0x1.01bb2cp-1, +- 0x1.fc3ee6p-2, +- 0x1.f511aap-2, +- 0x1.edeeeep-2, +- 0x1.e6d700p-2, +- 0x1.dfca26p-2, +- 0x1.d8c8aap-2, +- 0x1.d1d2d0p-2, +- 0x1.cae8dap-2, +- 0x1.c40b08p-2, +- 0x1.bd3998p-2, +- 0x1.b674c8p-2, +- 0x1.afbcd4p-2, +- 0x1.a911f0p-2, +- 0x1.a27456p-2, +- 0x1.9be438p-2, +- 0x1.9561c8p-2, +- 0x1.8eed36p-2, +- 0x1.8886b2p-2, +- 0x1.822e66p-2, +- 0x1.7be47ap-2, +- 0x1.75a91ap-2, +- 0x1.6f7c6ap-2, +- 0x1.695e8cp-2, +- 0x1.634fa6p-2, +- 0x1.5d4fd4p-2, +- 0x1.575f34p-2, +- 0x1.517de6p-2, +- 0x1.4bac00p-2, +- 0x1.45e99cp-2, +- 0x1.4036d0p-2, +- 0x1.3a93b2p-2, +- 0x1.350052p-2, +- 0x1.2f7cc4p-2, +- 0x1.2a0916p-2, +- 0x1.24a554p-2, +- 0x1.1f518ap-2, +- 0x1.1a0dc6p-2, +- 0x1.14da0ap-2, +- 0x1.0fb662p-2, +- 0x1.0aa2d0p-2, +- 0x1.059f5ap-2, +- 0x1.00ac00p-2, +- 0x1.f79184p-3, +- 0x1.edeb40p-3, +- 0x1.e46530p-3, +- 0x1.daff4ap-3, +- 0x1.d1b982p-3, +- 0x1.c893cep-3, +- 0x1.bf8e1cp-3, +- 0x1.b6a856p-3, +- 0x1.ade26cp-3, +- 0x1.a53c42p-3, +- 0x1.9cb5bep-3, +- 0x1.944ec2p-3, +- 0x1.8c0732p-3, +- 0x1.83deeap-3, +- 0x1.7bd5c8p-3, +- 0x1.73eba4p-3, +- 0x1.6c2056p-3, +- 0x1.6473b6p-3, +- 0x1.5ce596p-3, +- 0x1.5575c8p-3, +- 0x1.4e241ep-3, +- 0x1.46f066p-3, +- 0x1.3fda6cp-3, +- 0x1.38e1fap-3, +- 0x1.3206dcp-3, +- 0x1.2b48dap-3, +- 0x1.24a7b8p-3, +- 0x1.1e233ep-3, +- 0x1.17bb2cp-3, +- 0x1.116f48p-3, +- 0x1.0b3f52p-3, +- 0x1.052b0cp-3, +- 0x1.fe6460p-4, +- 0x1.f2a902p-4, +- 0x1.e72372p-4, +- 0x1.dbd32ap-4, +- 0x1.d0b7a0p-4, +- 0x1.c5d04ap-4, +- 0x1.bb1c98p-4, +- 0x1.b09bfcp-4, +- 0x1.a64de6p-4, +- 0x1.9c31c6p-4, +- 0x1.92470ap-4, +- 0x1.888d1ep-4, +- 0x1.7f036cp-4, +- 0x1.75a960p-4, +- 0x1.6c7e64p-4, +- 0x1.6381e2p-4, +- 0x1.5ab342p-4, +- 0x1.5211ecp-4, +- 0x1.499d48p-4, +- 0x1.4154bcp-4, +- 0x1.3937b2p-4, +- 0x1.31458ep-4, +- 0x1.297dbap-4, +- 0x1.21df9ap-4, +- 0x1.1a6a96p-4, +- 0x1.131e14p-4, +- 0x1.0bf97ep-4, +- 0x1.04fc3ap-4, +- 0x1.fc4b5ep-5, +- 0x1.eeea8cp-5, +- 0x1.e1d4d0p-5, +- 0x1.d508fap-5, +- 0x1.c885e0p-5, +- 0x1.bc4a54p-5, +- 0x1.b05530p-5, +- 0x1.a4a54ap-5, +- 0x1.99397ap-5, +- 0x1.8e109cp-5, +- 0x1.83298ep-5, +- 0x1.78832cp-5, +- 0x1.6e1c58p-5, +- 0x1.63f3f6p-5, +- 0x1.5a08e8p-5, +- 0x1.505a18p-5, +- 0x1.46e66cp-5, +- 0x1.3dacd2p-5, +- 0x1.34ac36p-5, +- 0x1.2be38cp-5, +- 0x1.2351c2p-5, +- 0x1.1af5d2p-5, +- 0x1.12ceb4p-5, +- 0x1.0adb60p-5, +- 0x1.031ad6p-5, +- 0x1.f7182ap-6, +- 0x1.e85c44p-6, +- 0x1.da0006p-6, +- 0x1.cc0180p-6, +- 0x1.be5ecep-6, +- 0x1.b1160ap-6, +- 0x1.a4255ap-6, +- 0x1.978ae8p-6, +- 0x1.8b44e6p-6, +- 0x1.7f5188p-6, +- 0x1.73af0cp-6, +- 0x1.685bb6p-6, +- 0x1.5d55ccp-6, +- 0x1.529b9ep-6, +- 0x1.482b84p-6, +- 0x1.3e03d8p-6, +- 0x1.3422fep-6, +- 0x1.2a875cp-6, +- 0x1.212f62p-6, +- 0x1.181984p-6, +- 0x1.0f443ep-6, +- 0x1.06ae14p-6, +- 0x1.fcab14p-7, +- 0x1.ec7262p-7, +- 0x1.dcaf36p-7, +- 0x1.cd5ecap-7, +- 0x1.be7e5ap-7, +- 0x1.b00b38p-7, +- 
0x1.a202bep-7, +- 0x1.94624ep-7, +- 0x1.87275ep-7, +- 0x1.7a4f6ap-7, +- 0x1.6dd7fep-7, +- 0x1.61beaep-7, +- 0x1.56011cp-7, +- 0x1.4a9cf6p-7, +- 0x1.3f8ff6p-7, +- 0x1.34d7dcp-7, +- 0x1.2a727ap-7, +- 0x1.205dacp-7, +- 0x1.169756p-7, +- 0x1.0d1d6ap-7, +- 0x1.03ede2p-7, +- 0x1.f60d8ap-8, +- 0x1.e4cc4ap-8, +- 0x1.d4143ap-8, +- 0x1.c3e1a6p-8, +- 0x1.b430ecp-8, +- 0x1.a4fe84p-8, +- 0x1.9646f4p-8, +- 0x1.8806d8p-8, +- 0x1.7a3adep-8, +- 0x1.6cdfccp-8, +- 0x1.5ff276p-8, +- 0x1.536fc2p-8, +- 0x1.4754acp-8, +- 0x1.3b9e40p-8, +- 0x1.30499cp-8, +- 0x1.2553eep-8, +- 0x1.1aba78p-8, +- 0x1.107a8cp-8, +- 0x1.06918cp-8, +- 0x1.f9f9d0p-9, +- 0x1.e77448p-9, +- 0x1.d58da6p-9, +- 0x1.c4412cp-9, +- 0x1.b38a3ap-9, +- 0x1.a36454p-9, +- 0x1.93cb12p-9, +- 0x1.84ba30p-9, +- 0x1.762d84p-9, +- 0x1.682100p-9, +- 0x1.5a90b0p-9, +- 0x1.4d78bcp-9, +- 0x1.40d564p-9, +- 0x1.34a306p-9, +- 0x1.28de12p-9, +- 0x1.1d8318p-9, +- 0x1.128ebap-9, +- 0x1.07fdb4p-9, +- 0x1.fb99b8p-10, +- 0x1.e7f232p-10, +- 0x1.d4fed8p-10, +- 0x1.c2b9d0p-10, +- 0x1.b11d70p-10, +- 0x1.a02436p-10, +- 0x1.8fc8c8p-10, +- 0x1.8005f0p-10, +- 0x1.70d6a4p-10, +- 0x1.6235fcp-10, +- 0x1.541f34p-10, +- 0x1.468daep-10, +- 0x1.397ceep-10, +- 0x1.2ce898p-10, +- 0x1.20cc76p-10, +- 0x1.15246ep-10, +- 0x1.09ec86p-10, +- 0x1.fe41cep-11, +- 0x1.e97ba4p-11, +- 0x1.d57f52p-11, +- 0x1.c245d4p-11, +- 0x1.afc85ep-11, +- 0x1.9e0058p-11, +- 0x1.8ce75ep-11, +- 0x1.7c7744p-11, +- 0x1.6caa0ep-11, +- 0x1.5d79ecp-11, +- 0x1.4ee142p-11, +- 0x1.40daa4p-11, +- 0x1.3360ccp-11, +- 0x1.266ea8p-11, +- 0x1.19ff46p-11, +- 0x1.0e0de8p-11, +- 0x1.0295f0p-11, +- 0x1.ef25d4p-12, +- 0x1.da0110p-12, +- 0x1.c5b542p-12, +- 0x1.b23a5ap-12, +- 0x1.9f8894p-12, +- 0x1.8d986ap-12, +- 0x1.7c629ap-12, +- 0x1.6be022p-12, +- 0x1.5c0a38p-12, +- 0x1.4cda54p-12, +- 0x1.3e4a24p-12, +- 0x1.305390p-12, +- 0x1.22f0b4p-12, +- 0x1.161be4p-12, +- 0x1.09cfa4p-12, +- 0x1.fc0d56p-13, +- 0x1.e577bcp-13, +- 0x1.cfd4a6p-13, +- 0x1.bb1a96p-13, +- 0x1.a74068p-13, +- 0x1.943d4ap-13, +- 0x1.8208bcp-13, +- 0x1.709a8ep-13, +- 0x1.5feadap-13, +- 0x1.4ff208p-13, +- 0x1.40a8c2p-13, +- 0x1.3207fcp-13, +- 0x1.2408eap-13, +- 0x1.16a502p-13, +- 0x1.09d5f8p-13, +- 0x1.fb2b7ap-14, +- 0x1.e3bcf4p-14, +- 0x1.cd5528p-14, +- 0x1.b7e946p-14, +- 0x1.a36eecp-14, +- 0x1.8fdc1cp-14, +- 0x1.7d2738p-14, +- 0x1.6b4702p-14, +- 0x1.5a329cp-14, +- 0x1.49e178p-14, +- 0x1.3a4b60p-14, +- 0x1.2b6876p-14, +- 0x1.1d3120p-14, +- 0x1.0f9e1cp-14, +- 0x1.02a868p-14, +- 0x1.ec929ap-15, +- 0x1.d4f4b4p-15, +- 0x1.be6abcp-15, +- 0x1.a8e8ccp-15, +- 0x1.94637ep-15, +- 0x1.80cfdcp-15, +- 0x1.6e2368p-15, +- 0x1.5c540cp-15, +- 0x1.4b581cp-15, +- 0x1.3b2652p-15, +- 0x1.2bb5ccp-15, +- 0x1.1cfe02p-15, +- 0x1.0ef6c4p-15, +- 0x1.019842p-15, +- 0x1.e9b5e8p-16, +- 0x1.d16f58p-16, +- 0x1.ba4f04p-16, +- 0x1.a447b8p-16, +- 0x1.8f4cccp-16, +- 0x1.7b5224p-16, +- 0x1.684c22p-16, +- 0x1.562facp-16, +- 0x1.44f21ep-16, +- 0x1.34894ap-16, +- 0x1.24eb72p-16, +- 0x1.160f44p-16, +- 0x1.07ebd2p-16, +- 0x1.f4f12ep-17, +- 0x1.db5ad0p-17, +- 0x1.c304f0p-17, +- 0x1.abe09ep-17, +- 0x1.95df98p-17, +- 0x1.80f43ap-17, +- 0x1.6d1178p-17, +- 0x1.5a2ae0p-17, +- 0x1.483488p-17, +- 0x1.372310p-17, +- 0x1.26eb9ep-17, +- 0x1.1783cep-17, +- 0x1.08e1bap-17, +- 0x1.f5f7d8p-18, +- 0x1.db92b6p-18, +- 0x1.c282cep-18, +- 0x1.aab7acp-18, +- 0x1.94219cp-18, +- 0x1.7eb1a2p-18, +- 0x1.6a5972p-18, +- 0x1.570b6ap-18, +- 0x1.44ba86p-18, +- 0x1.335a62p-18, +- 0x1.22df2ap-18, +- 0x1.133d96p-18, +- 0x1.046aeap-18, +- 0x1.ecb9d0p-19, +- 0x1.d21398p-19, +- 0x1.b8d094p-19, +- 0x1.a0df10p-19, +- 0x1.8a2e26p-19, +- 0x1.74adc8p-19, 
+- 0x1.604ea8p-19, +- 0x1.4d0232p-19, +- 0x1.3aba86p-19, +- 0x1.296a70p-19, +- 0x1.190562p-19, +- 0x1.097f62p-19, +- 0x1.f59a20p-20, +- 0x1.d9c736p-20, +- 0x1.bf716cp-20, +- 0x1.a6852cp-20, +- 0x1.8eefd8p-20, +- 0x1.789fb8p-20, +- 0x1.6383f8p-20, +- 0x1.4f8c96p-20, +- 0x1.3caa62p-20, +- 0x1.2acee2p-20, +- 0x1.19ec60p-20, +- 0x1.09f5d0p-20, +- 0x1.f5bd96p-21, +- 0x1.d9371ep-21, +- 0x1.be41dep-21, +- 0x1.a4c89ep-21, +- 0x1.8cb738p-21, +- 0x1.75fa8ep-21, +- 0x1.608078p-21, +- 0x1.4c37c0p-21, +- 0x1.39100ep-21, +- 0x1.26f9e0p-21, +- 0x1.15e682p-21, +- 0x1.05c804p-21, +- 0x1.ed2254p-22, +- 0x1.d06ad6p-22, +- 0x1.b551c8p-22, +- 0x1.9bc0a0p-22, +- 0x1.83a200p-22, +- 0x1.6ce1aap-22, +- 0x1.576c72p-22, +- 0x1.43302cp-22, +- 0x1.301ba2p-22, +- 0x1.1e1e86p-22, +- 0x1.0d2966p-22, +- 0x1.fa5b50p-23, +- 0x1.dc3ae4p-23, +- 0x1.bfd756p-23, +- 0x1.a517dap-23, +- 0x1.8be4f8p-23, +- 0x1.74287ep-23, +- 0x1.5dcd66p-23, +- 0x1.48bfd4p-23, +- 0x1.34ecf8p-23, +- 0x1.224310p-23, +- 0x1.10b148p-23, +- }, +-}; +diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h +index 7f0a8aa5f2..862eefaf8f 100644 +--- a/sysdeps/aarch64/fpu/vecmath_config.h ++++ b/sysdeps/aarch64/fpu/vecmath_config.h +@@ -75,49 +75,37 @@ extern const struct v_log10_data + } table[1 << V_LOG10_TABLE_BITS]; + } __v_log10_data attribute_hidden; + +-extern const struct erff_data ++extern const struct v_erff_data + { + struct + { + float erf, scale; + } tab[513]; +-} __erff_data attribute_hidden; ++} __v_erff_data attribute_hidden; + +-extern const struct sv_erff_data +-{ +- float erf[513]; +- float scale[513]; +-} __sv_erff_data attribute_hidden; +- +-extern const struct erf_data ++extern const struct v_erf_data + { + struct + { + double erf, scale; + } tab[769]; +-} __erf_data attribute_hidden; +- +-extern const struct sv_erf_data +-{ +- double erf[769]; +- double scale[769]; +-} __sv_erf_data attribute_hidden; ++} __v_erf_data attribute_hidden; + +-extern const struct erfc_data ++extern const struct v_erfc_data + { + struct + { + double erfc, scale; + } tab[3488]; +-} __erfc_data attribute_hidden; ++} __v_erfc_data attribute_hidden; + +-extern const struct erfcf_data ++extern const struct v_erfcf_data + { + struct + { + float erfc, scale; + } tab[645]; +-} __erfcf_data attribute_hidden; ++} __v_erfcf_data attribute_hidden; + + /* Some data for AdvSIMD and SVE pow's internal exp and log. */ + #define V_POW_EXP_TABLE_BITS 8 + +commit 4148940836eee07d1138da6f1805280eeb8217e3 +Author: Pierre Blanchard +Date: Mon Dec 9 15:53:04 2024 +0000 + + AArch64: Improve codegen in AdvSIMD pow + + Remove spurious ADRP. Improve memory access by shuffling constants and + using more indexed MLAs. + + A few more optimisation with no impact on accuracy + - force fmas contraction + - switch from shift-aided rint to rint instruction + + Between 1 and 5% throughput improvement on Neoverse + V1 depending on benchmark. + + (cherry picked from commit 569cfaaf4984ae70b23c61ee28a609b5aef93fea) + +diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c +index 3c91e3e183..81e134ac2f 100644 +--- a/sysdeps/aarch64/fpu/pow_advsimd.c ++++ b/sysdeps/aarch64/fpu/pow_advsimd.c +@@ -22,9 +22,6 @@ + /* Defines parameters of the approximation and scalar fallback. 
*/ + #include "finite_pow.h" + +-#define VecSmallExp v_u64 (SmallExp) +-#define VecThresExp v_u64 (ThresExp) +- + #define VecSmallPowX v_u64 (SmallPowX) + #define VecThresPowX v_u64 (ThresPowX) + #define VecSmallPowY v_u64 (SmallPowY) +@@ -32,36 +29,48 @@ + + static const struct data + { +- float64x2_t log_poly[6]; +- float64x2_t exp_poly[3]; +- float64x2_t ln2_hi, ln2_lo; +- float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx; + uint64x2_t inf; ++ float64x2_t small_powx; ++ uint64x2_t offset, mask; ++ uint64x2_t mask_sub_0, mask_sub_1; ++ float64x2_t log_c0, log_c2, log_c4, log_c5; ++ double log_c1, log_c3; ++ double ln2_lo, ln2_hi; ++ uint64x2_t small_exp, thres_exp; ++ double ln2_lo_n, ln2_hi_n; ++ double inv_ln2_n, exp_c2; ++ float64x2_t exp_c0, exp_c1; + } data = { ++ /* Power threshold. */ ++ .inf = V2 (0x7ff0000000000000), ++ .small_powx = V2 (0x1p-126), ++ .offset = V2 (Off), ++ .mask = V2 (0xfffULL << 52), ++ .mask_sub_0 = V2 (1ULL << 52), ++ .mask_sub_1 = V2 (52ULL << 52), + /* Coefficients copied from v_pow_log_data.c + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. */ +- .log_poly +- = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2), +- V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4), +- V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) }, +- .ln2_hi = V2 (0x1.62e42fefa3800p-1), +- .ln2_lo = V2 (0x1.ef35793c76730p-45), ++ .log_c0 = V2 (0x1.555555555556p-2 * -2), ++ .log_c1 = -0x1.0000000000006p-2 * -2, ++ .log_c2 = V2 (0x1.999999959554ep-3 * 4), ++ .log_c3 = -0x1.555555529a47ap-3 * 4, ++ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8), ++ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8), ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 + (0.550 without fma) if |x| < ln2/512. */ +- .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), +- V2 (0x1.5555576a5adcep-5) }, +- .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */ +- .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ +- .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ +- .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), +- .small_powx = V2 (0x1p-126), +- .inf = V2 (0x7ff0000000000000) ++ .exp_c0 = V2 (0x1.fffffffffffd4p-2), ++ .exp_c1 = V2 (0x1.5555571d6ef9p-3), ++ .exp_c2 = 0x1.5555576a5adcep-5, ++ .small_exp = V2 (0x3c90000000000000), ++ .thres_exp = V2 (0x03f0000000000000), ++ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */ ++ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */ ++ .ln2_lo_n = -0x1.c610ca86c3899p-45, + }; + +-#define A(i) data.log_poly[i] +-#define C(i) data.exp_poly[i] +- + /* This version implements an algorithm close to scalar pow but + - does not implement the trick in the exp's specialcase subroutine to avoid + double-rounding, +@@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off)); +- int64x2_t k +- = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. 
*/ +- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52))); ++ uint64x2_t tmp = vsubq_u64 (ix, d->offset); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); ++ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + float64x2_t kd = vcvtq_f64_s64 (k); + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ +@@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); + /* k*Ln2 + log(c) + r. */ +- float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi); ++ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo); ++ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1); + float64x2_t t2 = vaddq_f64 (t1, r); +- float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo); ++ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0); + float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); + /* Evaluation is optimized assuming superscalar pipelined execution. */ + float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); +@@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) + float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); + float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ +- float64x2_t a56 = vfmaq_f64 (A (4), r, A (5)); +- float64x2_t a34 = vfmaq_f64 (A (2), r, A (3)); +- float64x2_t a12 = vfmaq_f64 (A (0), r, A (1)); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1); ++ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5); ++ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1); ++ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0); + float64x2_t p = vfmaq_f64 (a34, ar2, a56); + p = vfmaq_f64 (a12, ar2, p); + p = vmulq_f64 (ar3, p); +@@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail) + + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ + static inline float64x2_t +-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) ++v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d) + { + /* Fallback to scalar exp_inline for all lanes if any lane + contains value of x s.t. |x| <= 2^-54 or >= 512. */ +- uint64x2_t abstop +- = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52); +- uint64x2_t uoflowx +- = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp); ++ uint64x2_t uoflowx = vcgeq_u64 ( ++ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp), ++ d->thres_exp); + if (__glibc_unlikely (v_any_u64 (uoflowx))) +- return exp_special_case (x, xtail); ++ return exp_special_case (x, vnegq_f64 (neg_xtail)); + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ +- float64x2_t z = vmulq_f64 (d->inv_ln2_n, x); + /* z - kd is in [-1, 1] in non-nearest rounding modes. 
*/ +- float64x2_t kd = vaddq_f64 (z, d->shift); +- uint64x2_t ki = vreinterpretq_u64_f64 (kd); +- kd = vsubq_f64 (kd, d->shift); +- float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n); +- r = vfmsq_f64 (r, kd, d->ln2_lo_n); ++ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n); ++ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0); ++ float64x2_t kd = vrndnq_f64 (z); ++ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z)); ++ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n); ++ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1); ++ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ +- r = vaddq_f64 (r, xtail); ++ r = vsubq_f64 (r, neg_xtail); + /* 2^(k/N) ~= scale. */ + uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); + uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); +@@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) + sbits = vaddq_u64 (sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + float64x2_t r2 = vmulq_f64 (r, r); +- float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); +- tmp = vfmaq_f64 (C (0), r, tmp); ++ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1); ++ tmp = vfmaq_f64 (d->exp_c0, r, tmp); + tmp = vfmaq_f64 (r, r2, tmp); + float64x2_t scale = vreinterpretq_f64_u64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there +@@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) + { + /* Normalize subnormal x so exponent becomes negative. */ + uint64x2_t vix_norm = vreinterpretq_u64_f64 ( +- vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52))))); +- vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); ++ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0)))); ++ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1); + vix = vbslq_u64 (sub_x, vix_norm, vix); + } + } +@@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) + + /* Vector Exp(y_loghi, y_loglo). */ + float64x2_t vehi = vmulq_f64 (y, vhi); +- float64x2_t velo = vmulq_f64 (y, vlo); + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); +- velo = vsubq_f64 (velo, vemi); +- return v_exp_inline (vehi, velo, d); ++ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo); ++ return v_exp_inline (vehi, neg_velo, d); + } + +commit ae04f63087415eba9060143608b03db693854bb7 +Author: Pierre Blanchard +Date: Mon Dec 9 15:54:34 2024 +0000 + + AArch64: Improve codegen in AdvSIMD logs + + Remove spurious ADRP and a few MOVs. + Reduce memory access by using more indexed MLAs in polynomial. + Align notation so that algorithms are easier to compare. + Speedup on Neoverse V1 for log10 (8%), log (8.5%), and log2 (10%). + Update error threshold in AdvSIMD log (now matches SVE log). + + (cherry picked from commit 8eb5ad2ebc94cc5bedbac57c226c02ec254479c7) + +diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c +index c065aaebae..f69ed21c39 100644 +--- a/sysdeps/aarch64/fpu/log10_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10_advsimd.c +@@ -18,36 +18,36 @@ + . 
*/ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" +- +-#define N (1 << V_LOG10_TABLE_BITS) + + static const struct data + { +- uint64x2_t min_norm; ++ uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; +- float64x2_t poly[5]; +- float64x2_t invln10, log10_2, ln2; +- uint64x2_t sign_exp_mask; ++ double invln10, log10_2; ++ double c1, c3; ++ float64x2_t c0, c2, c4; + } data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. */ +- .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), +- V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), +- V2 (-0x1.287461742fee4p-4) }, +- .ln2 = V2 (0x1.62e42fefa39efp-1), +- .invln10 = V2 (0x1.bcb7b1526e50ep-2), +- .log10_2 = V2 (0x1.34413509f79ffp-2), +- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ +- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ ++ .c0 = V2 (-0x1.bcb7b1526e506p-3), ++ .c1 = 0x1.287a7636be1d1p-3, ++ .c2 = V2 (-0x1.bcb7b158af938p-4), ++ .c3 = 0x1.63c78734e6d07p-4, ++ .c4 = V2 (-0x1.287461742fee4p-4), ++ .invln10 = 0x1.bcb7b1526e50ep-2, ++ .log10_2 = 0x1.34413509f79ffp-2, ++ .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), ++ /* Lower bound is 0x0010000000000000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound - offset (which wraps around). */ ++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), ++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */ + }; + +-#define Off v_u64 (0x3fe6900900000000) ++#define N (1 << V_LOG10_TABLE_BITS) + #define IndexMask (N - 1) + +-#define T(s, i) __v_log10_data.s[i] +- + struct entry + { + float64x2_t invc; +@@ -70,10 +70,11 @@ lookup (uint64x2_t i) + } + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, +- uint32x2_t special) ++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, ++ uint32x2_t special, const struct data *d) + { +- return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); ++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); ++ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); + } + + /* Fast implementation of double-precision vector log10 +@@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- uint64x2_t ix = vreinterpretq_u64_f64 (x); +- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), +- vget_low_u32 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint64x2_t u = vreinterpretq_u64_f64 (x); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ +- uint64x2_t tmp = vsubq_u64 (ix, Off); +- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); +- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + +- struct entry e = lookup (tmp); ++ struct entry e = lookup (u_off); ++ ++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), ++ vget_low_u32 (d->special_bound)); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); +@@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in v_log10_data.c are computed (in extended precision) as +- e.log10c := e.logc * ivln10. */ +- float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); ++ e.log10c := e.logc * invln10. */ ++ float64x2_t cte = vld1q_f64 (&d->invln10); ++ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); + + /* y = log10(1+r) + n * log10(2). */ +- float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); ++ hi = vfmaq_laneq_f64 (hi, kd, cte, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t r2 = vmulq_f64 (r, r); +- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1); ++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); ++ y = vfmaq_f64 (y, d->c4, r2); ++ y = vfmaq_f64 (p, y, r2); + + if (__glibc_unlikely (v_any_u32h (special))) +- return special_case (x, y, hi, r2, special); +- return vfmaq_f64 (hi, r2, y); ++ return special_case (hi, u_off, y, r2, special, d); ++ return vfmaq_f64 (hi, y, r2); + } +diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c +index 4057c552d8..1eea1f86eb 100644 +--- a/sysdeps/aarch64/fpu/log2_advsimd.c ++++ b/sysdeps/aarch64/fpu/log2_advsimd.c +@@ -18,31 +18,33 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" +- +-#define N (1 << V_LOG2_TABLE_BITS) + + static const struct data + { +- uint64x2_t min_norm; ++ uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; +- float64x2_t poly[5]; +- float64x2_t invln2; +- uint64x2_t sign_exp_mask; ++ float64x2_t c0, c2; ++ double c1, c3, invln2, c4; + } data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ +- .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), +- V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), +- V2 (-0x1.ec738d616fe26p-3) }, +- .invln2 = V2 (0x1.71547652b82fep0), +- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ +- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ ++ .c0 = V2 (-0x1.71547652b8300p-1), ++ .c1 = 0x1.ec709dc340953p-2, ++ .c2 = V2 (-0x1.71547651c8f35p-2), ++ .c3 = 0x1.2777ebe12dda5p-2, ++ .c4 = -0x1.ec738d616fe26p-3, ++ .invln2 = 0x1.71547652b82fep0, ++ .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), ++ /* Lower bound is 0x0010000000000000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound - offset (which wraps around). 
*/ ++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), ++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ + }; + +-#define Off v_u64 (0x3fe6900900000000) ++#define N (1 << V_LOG2_TABLE_BITS) + #define IndexMask (N - 1) + + struct entry +@@ -67,10 +69,11 @@ lookup (uint64x2_t i) + } + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, +- uint32x2_t special) ++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, ++ uint32x2_t special, const struct data *d) + { +- return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); ++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); ++ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); + } + + /* Double-precision vector log2 routine. Implements the same algorithm as +@@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, + float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- uint64x2_t ix = vreinterpretq_u64_f64 (x); +- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), +- vget_low_u32 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint64x2_t u = vreinterpretq_u64_f64 (x); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- uint64x2_t tmp = vsubq_u64 (ix, Off); +- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); +- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + +- struct entry e = lookup (tmp); ++ struct entry e = lookup (u_off); + +- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ ++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), ++ vget_low_u32 (d->special_bound)); + ++ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); +- float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); ++ ++ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); ++ float64x2_t hi ++ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); + + float64x2_t r2 = vmulq_f64 (r, r); +- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); +- w = vaddq_f64 (kd, w); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1); ++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); ++ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); ++ y = vfmaq_f64 (p, r2, y); + + if (__glibc_unlikely (v_any_u32h (special))) +- return special_case (x, y, w, r2, special); +- return vfmaq_f64 (w, r2, y); ++ return special_case (hi, u_off, y, r2, special, d); ++ return vfmaq_f64 (hi, y, r2); + } +diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c +index 015a6da7d7..b1a27fbc29 100644 +--- a/sysdeps/aarch64/fpu/log_advsimd.c ++++ b/sysdeps/aarch64/fpu/log_advsimd.c +@@ -21,27 +21,29 @@ + + static const struct data + { +- uint64x2_t min_norm; ++ uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; +- float64x2_t poly[5]; +- float64x2_t ln2; +- uint64x2_t sign_exp_mask; ++ float64x2_t c0, c2; ++ double c1, c3, ln2, c4; + } data = { +- /* Worst-case error: 1.17 + 0.5 ulp. +- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ +- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), +- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), +- V2 (-0x1.554e550bd501ep-3) }, +- .ln2 = V2 (0x1.62e42fefa39efp-1), +- .min_norm = V2 (0x0010000000000000), +- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ +- .sign_exp_mask = V2 (0xfff0000000000000) ++ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ ++ .c0 = V2 (-0x1.ffffffffffff7p-2), ++ .c1 = 0x1.55555555170d4p-2, ++ .c2 = V2 (-0x1.0000000399c27p-2), ++ .c3 = 0x1.999b2e90e94cap-3, ++ .c4 = -0x1.554e550bd501ep-3, ++ .ln2 = 0x1.62e42fefa39efp-1, ++ .sign_exp_mask = V2 (0xfff0000000000000), ++ .off = V2 (0x3fe6900900000000), ++ /* Lower bound is 0x0010000000000000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound - offset (which wraps around). */ ++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), ++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-126). */ + }; + +-#define A(i) d->poly[i] + #define N (1 << V_LOG_TABLE_BITS) + #define IndexMask (N - 1) +-#define Off v_u64 (0x3fe6900900000000) + + struct entry + { +@@ -64,48 +66,56 @@ lookup (uint64x2_t i) + } + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, +- uint32x2_t cmp) ++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, ++ uint32x2_t special, const struct data *d) + { +- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); ++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); ++ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); + } + ++/* Double-precision vector log routine. ++ The maximum observed error is 2.17 ULP: ++ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 ++ want 0x1.ffffff1cca045p-2. 
*/ + float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- float64x2_t z, r, r2, p, y, kd, hi; +- uint64x2_t ix, iz, tmp; +- uint32x2_t cmp; +- int64x2_t k; +- struct entry e; + +- ix = vreinterpretq_u64_f64 (x); +- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), +- vget_low_u32 (d->special_bound)); ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint64x2_t u = vreinterpretq_u64_f64 (x); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- tmp = vsubq_u64 (ix, Off); +- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ +- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); +- z = vreinterpretq_f64_u64 (iz); +- e = lookup (tmp); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); ++ float64x2_t z = vreinterpretq_f64_u64 (iz); ++ ++ struct entry e = lookup (u_off); ++ ++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), ++ vget_low_u32 (d->special_bound)); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ +- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); +- kd = vcvtq_f64_s64 (k); ++ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); ++ float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ +- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); ++ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); ++ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); ++ + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ +- r2 = vmulq_f64 (r, r); +- y = vfmaq_f64 (A (2), A (3), r); +- p = vfmaq_f64 (A (0), A (1), r); +- y = vfmaq_f64 (y, A (4), r2); +- y = vfmaq_f64 (p, y, r2); +- +- if (__glibc_unlikely (v_any_u32h (cmp))) +- return special_case (x, y, hi, r2, cmp); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1); ++ float64x2_t r2 = vmulq_f64 (r, r); ++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); ++ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); ++ y = vfmaq_f64 (p, r2, y); ++ ++ if (__glibc_unlikely (v_any_u32h (special))) ++ return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); + } + +commit 2aed9796bfb17b257e63b12cefdb7ff60be09626 +Author: Pierre Blanchard +Date: Mon Dec 9 15:55:39 2024 +0000 + + AArch64: Improve codegen in users of ADVSIMD log1p helper + + Add inline helper for log1p and rearrange operations so MOV + is not necessary in reduction or around the special-case handler. + Reduce memory access by using more indexed MLAs in polynomial. + Speedup on Neoverse V1 for log1p (3.5%), acosh (7.5%) and atanh (10%). 
+ + (cherry picked from commit ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8) + +diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c +index c88283cf11..a98f4a2e4d 100644 +--- a/sysdeps/aarch64/fpu/acosh_advsimd.c ++++ b/sysdeps/aarch64/fpu/acosh_advsimd.c +@@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) + x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); + #endif + +- float64x2_t xm1 = vsubq_f64 (x, v_f64 (1)); +- float64x2_t y; +- y = vaddq_f64 (x, v_f64 (1)); ++ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); ++ float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); + y = vmulq_f64 (y, xm1); + y = vsqrtq_f64 (y); + y = vaddq_f64 (xm1, y); +diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c +index 3c3d0bd6ad..eb9769aeac 100644 +--- a/sysdeps/aarch64/fpu/atanh_advsimd.c ++++ b/sysdeps/aarch64/fpu/atanh_advsimd.c +@@ -23,15 +23,19 @@ + const static struct data + { + struct v_log1p_data log1p_consts; +- uint64x2_t one, half; ++ uint64x2_t one; ++ uint64x2_t sign_mask; + } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), +- .half = V2 (0x3fe0000000000000) }; ++ .sign_mask = V2 (0x8000000000000000) }; + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, uint64x2_t special) ++special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, ++ uint64x2_t special, const struct data *d) + { +- return v_call_f64 (atanh, x, y, special); ++ y = log1p_inline (y, &d->log1p_consts); ++ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), ++ vmulq_f64 (halfsign, y), special); + } + + /* Approximation for vector double-precision atanh(x) using modified log1p. +@@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); + ++ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (ax); +- uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia); + uint64x2_t special = vcgeq_u64 (ia, d->one); +- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half)); + + #if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, special); +@@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) + + float64x2_t y; + y = vaddq_f64 (ax, ax); +- y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax)); +- y = log1p_inline (y, &d->log1p_consts); ++ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); + + if (__glibc_unlikely (v_any_u64 (special))) +- return special_case (x, vmulq_f64 (y, halfsign), special); ++#if WANT_SIMD_EXCEPT ++ return special_case (x, halfsign, y, special, d); ++#else ++ return special_case (ax, halfsign, y, special, d); ++#endif ++ ++ y = log1p_inline (y, &d->log1p_consts); + return vmulq_f64 (y, halfsign); + } +diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c +index 114064c696..1263587201 100644 +--- a/sysdeps/aarch64/fpu/log1p_advsimd.c ++++ b/sysdeps/aarch64/fpu/log1p_advsimd.c +@@ -17,43 +17,26 @@ + License along with the GNU C Library; if not, see + . */ + +-#include "v_math.h" +-#include "poly_advsimd_f64.h" ++#define WANT_V_LOG1P_K0_SHORTCUT 0 ++#include "v_log1p_inline.h" + + const static struct data + { +- float64x2_t poly[19], ln2[2]; +- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one; +- int64x2_t one_top; +-} data = { +- /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ +- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), +- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), +- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), +- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), +- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), +- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), +- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), +- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), +- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), +- V2 (-0x1.cfa7385bdb37ep-6) }, +- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, +- /* top32(asuint64(sqrt(2)/2)) << 32. */ +- .hf_rt2_top = V2 (0x3fe6a09e00000000), +- /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ +- .one_m_hf_rt2_top = V2 (0x00095f6200000000), +- .umask = V2 (0x000fffff00000000), +- .one_top = V2 (0x3ff), +- .inf = V2 (0x7ff0000000000000), +- .minus_one = V2 (0xbff0000000000000) +-}; ++ struct v_log1p_data d; ++ uint64x2_t inf, minus_one; ++} data = { .d = V_LOG1P_CONSTANTS_TABLE, ++ .inf = V2 (0x7ff0000000000000), ++ .minus_one = V2 (0xbff0000000000000) }; + + #define BottomMask v_u64 (0xffffffff) + +-static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, uint64x2_t special) ++static float64x2_t NOINLINE VPCS_ATTR ++special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) + { +- return v_call_f64 (log1p, x, y, special); ++ /* Side-step special lanes so fenv exceptions are not triggered ++ inadvertently. */ ++ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); ++ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); + } + + /* Vector log1p approximation using polynomial on reduced interval. Routine is +@@ -66,66 +49,14 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); +- uint64x2_t special = vcgeq_u64 (ia, d->inf); + +-#if WANT_SIMD_EXCEPT +- special = vorrq_u64 (special, +- vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1)))); +- if (__glibc_unlikely (v_any_u64 (special))) +- x = v_zerofy_f64 (x, special); +-#else +- special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1))); +-#endif ++ uint64x2_t special_cases ++ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one)); + +- /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f +- is in [sqrt(2)/2, sqrt(2)]): +- log1p(x) = k*log(2) + log1p(f). ++ if (__glibc_unlikely (v_any_u64 (special_cases))) ++ return special_case (x, special_cases, d); + +- f may not be representable exactly, so we need a correction term: +- let m = round(1 + x), c = (1 + x) - m. +- c << m: at very small x, log1p(x) ~ x, hence: +- log(1+x) - log(m) ~ c/m. +- +- We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ +- +- /* Obtain correctly scaled k by manipulation in the exponent. +- The scalar algorithm casts down to 32-bit at this point to calculate k and +- u_red. We stay in double-width to obtain f and k, using the same constants +- as the scalar algorithm but shifted left by 32. */ +- float64x2_t m = vaddq_f64 (x, v_f64 (1)); +- uint64x2_t mi = vreinterpretq_u64_f64 (m); +- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); +- +- int64x2_t ki +- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); +- float64x2_t k = vcvtq_f64_s64 (ki); +- +- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ +- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); +- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); +- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); +- +- /* Correction term c/m. */ +- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); +- +- /* Approximate log1p(x) on the reduced input using a polynomial. Because +- log1p(0)=0 we choose an approximation of the form: +- x + C0*x^2 + C1*x^3 + C2x^4 + ... +- Hence approximation has the form f + f^2 * P(f) +- where P(x) = C0 + C1*x + C2x^2 + ... +- Assembling this all correctly is dealt with at the final step. */ +- float64x2_t f2 = vmulq_f64 (f, f); +- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); +- +- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); +- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); +- float64x2_t y = vaddq_f64 (ylo, yhi); +- +- if (__glibc_unlikely (v_any_u64 (special))) +- return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p), +- special); +- +- return vfmaq_f64 (y, f2, p); ++ return log1p_inline (x, &d->d); + } + + strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h +index 242e43b6ee..834ff65adf 100644 +--- a/sysdeps/aarch64/fpu/v_log1p_inline.h ++++ b/sysdeps/aarch64/fpu/v_log1p_inline.h +@@ -21,29 +21,30 @@ + #define AARCH64_FPU_V_LOG1P_INLINE_H + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + struct v_log1p_data + { +- float64x2_t poly[19], ln2[2]; ++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; + int64x2_t one_top; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; ++ double ln2[2]; + }; + + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ + #define V_LOG1P_CONSTANTS_TABLE \ + { \ +- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \ +- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \ +- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \ +- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \ +- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \ +- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \ +- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \ +- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \ +- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \ +- V2 (-0x1.cfa7385bdb37ep-6) }, \ +- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \ ++ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \ ++ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \ ++ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \ ++ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \ ++ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \ ++ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \ ++ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \ ++ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \ ++ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \ ++ .c18 = -0x1.cfa7385bdb37ep-6, \ ++ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \ + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ +@@ -51,19 +52,45 @@ struct v_log1p_data + + #define BottomMask v_u64 (0xffffffff) + ++static inline float64x2_t ++eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d) ++{ ++ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ float64x2_t c1315 = vld1q_f64 (&d->c13); ++ float64x2_t c1718 = vld1q_f64 (&d->c17); ++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0); ++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1); ++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1); ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1); ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1); ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0); ++ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1); ++ p = vfmaq_f64 (p1415, m2, p); ++ p = vfmaq_f64 (p1213, m2, p); ++ p = vfmaq_f64 (p1011, m2, p); ++ p = vfmaq_f64 (p89, m2, p); ++ p = vfmaq_f64 (p67, m2, p); ++ p = vfmaq_f64 (p45, m2, p); ++ p = vfmaq_f64 (p23, m2, p); ++ return vfmaq_f64 (p01, m2, p); ++} ++ + static inline float64x2_t + log1p_inline (float64x2_t x, const struct v_log1p_data *d) + { +- /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several +- modifications: ++ /* Helper for calculating log(x + 1): + - No special-case handling - this should be dealt with by the caller. +- - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, +- using v_sel, for improved accuracy when the argument to log1p is close to +- 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in +- the source of the caller before including this file. 
+- See v_log1pf_2u1.c for details of the algorithm. */ +- float64x2_t m = vaddq_f64 (x, v_f64 (1)); ++ using v_sel, for improved accuracy when the argument to log1p is close ++ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 ++ in the source of the caller before including this file. */ ++ float64x2_t m = vaddq_f64 (x, v_f64 (1.0)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); + +@@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); +- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); ++ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0)); + + /* Correction term c/m. */ +- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); ++ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m); + + #ifndef WANT_V_LOG1P_K0_SHORTCUT +-#error \ +- "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" ++# error \ ++ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" + #elif WANT_V_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ +@@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + float64x2_t f2 = vmulq_f64 (f, f); +- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); ++ float64x2_t p = eval_poly (f, f2, d); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ +- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); +- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); ++ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); ++ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1); ++ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0); + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); + } + + +commit 9170b921fa49d2ef37141506837baaae92c7d3f8 +Author: Joana Cruz +Date: Tue Dec 17 14:47:31 2024 +0000 + + AArch64: Improve codegen of AdvSIMD logf function family + + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. + 8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log, + and 2% for log10. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit d6e034f5b222a9ed1aeb5de0c0c7d0dda8b63da3) + +diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c +index 82228b599a..0d792c3df9 100644 +--- a/sysdeps/aarch64/fpu/log10f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10f_advsimd.c +@@ -18,21 +18,25 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + static const struct data + { ++ float32x4_t c0, c2, c4, c6, inv_ln10, ln2; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; +- float32x4_t poly[8]; +- float32x4_t inv_ln10, ln2; ++ float c1, c3, c5, c7; + } data = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. 
*/ +- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), +- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), +- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, ++ .c0 = V4 (-0x1.bcb79cp-3f), ++ .c1 = 0x1.2879c8p-3f, ++ .c2 = V4 (-0x1.bcd472p-4f), ++ .c3 = 0x1.6408f8p-4f, ++ .c4 = V4 (-0x1.246f8p-4f), ++ .c5 = 0x1.f0e514p-5f, ++ .c6 = V4 (-0x1.0fc92cp-4f), ++ .c7 = 0x1.f5f76ap-5f, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), + /* Lower bound is the smallest positive normal float 0x00800000. For +@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- ++ float32x4_t c1357 = vld1q_f32 (&d->c1); + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ +@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) + + /* y = log10(1+r) + n * log10(2). */ + float32x4_t r2 = vmulq_f32 (r, r); +- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); ++ ++ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); ++ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); ++ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); ++ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); ++ ++ float32x4_t p47 = vfmaq_f32 (c45, r2, c67); ++ float32x4_t p27 = vfmaq_f32 (c23, r2, p47); ++ float32x4_t poly = vfmaq_f32 (c01, r2, p27); ++ + /* y = Log10(2) * n + poly * InvLn(10). */ + float32x4_t y = vfmaq_f32 (r, d->ln2, n); + y = vmulq_f32 (y, d->inv_ln10); +diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c +index 84effe4fe9..116c36c8e2 100644 +--- a/sysdeps/aarch64/fpu/log2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log2f_advsimd.c +@@ -18,22 +18,27 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + static const struct data + { ++ float32x4_t c0, c2, c4, c6, c8; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; +- float32x4_t poly[9]; ++ float c1, c3, c5, c7; + } data = { + /* Coefficients generated using Remez algorithm approximate + log2(1+r)/r for r in [ -1/3, 1/3 ]. + rel error: 0x1.c4c4b0cp-26. */ +- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ +- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), +- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), +- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, ++ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ ++ .c1 = -0x1.715458p-1f, ++ .c2 = V4 (0x1.ec701cp-2f), ++ .c3 = -0x1.7171a4p-2f, ++ .c4 = V4 (0x1.27a0b8p-2f), ++ .c5 = -0x1.e5143ep-3f, ++ .c6 = V4 (0x1.9d8ecap-3f), ++ .c7 = -0x1.c675bp-3f, ++ .c8 = V4 (0x1.9e495p-3f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ +@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) + + /* y = log2(1+r) + n. 
*/ + float32x4_t r2 = vmulq_f32 (r, r); +- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); ++ ++ float32x4_t c1357 = vld1q_f32 (&d->c1); ++ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); ++ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); ++ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); ++ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); ++ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); ++ float32x4_t p48 = vfmaq_f32 (c45, r2, p68); ++ float32x4_t p28 = vfmaq_f32 (c23, r2, p48); ++ float32x4_t p = vfmaq_f32 (c01, r2, p28); + + if (__glibc_unlikely (v_any_u16h (special))) + return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); + } ++ + libmvec_hidden_def (V_NAME_F1 (log2)) + HALF_WIDTH_ALIAS_F1 (log2) +diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c +index c20dbfd6c0..d9e64c732d 100644 +--- a/sysdeps/aarch64/fpu/logf_advsimd.c ++++ b/sysdeps/aarch64/fpu/logf_advsimd.c +@@ -21,16 +21,19 @@ + + static const struct data + { +- uint32x4_t off, offset_lower_bound; ++ float32x4_t c2, c4, c6, ln2; ++ uint32x4_t off, offset_lower_bound, mantissa_mask; + uint16x8_t special_bound; +- uint32x4_t mantissa_mask; +- float32x4_t poly[7]; +- float32x4_t ln2; ++ float c1, c3, c5, c0; + } data = { + /* 3.34 ulp error. */ +- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), +- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), +- V4 (-0x1.ffffc8p-2f) }, ++ .c0 = -0x1.3e737cp-3f, ++ .c1 = 0x1.5a9aa2p-3f, ++ .c2 = V4 (-0x1.4f9934p-3f), ++ .c3 = 0x1.961348p-3f, ++ .c4 = V4 (-0x1.00187cp-2f), ++ .c5 = 0x1.555d7cp-2f, ++ .c6 = V4 (-0x1.ffffc8p-2f), + .ln2 = V4 (0x1.62e43p-1f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been +@@ -41,8 +44,6 @@ static const struct data + .mantissa_mask = V4 (0x007fffff) + }; + +-#define P(i) d->poly[7 - i] +- + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- float32x4_t n, p, q, r, r2, y; +- uint32x4_t u, u_off; +- uint16x4_t cmp; ++ float32x4_t c1350 = vld1q_f32 (&d->c1); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ +- u_off = vreinterpretq_u32_f32 (x); ++ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u_off = vsubq_u32 (u_off, d->off); +- n = vcvtq_f32_s32 ( ++ float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ +- u = vandq_u32 (u_off, d->mantissa_mask); +- u = vaddq_u32 (u, d->off); +- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); ++ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); + +- cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), +- vget_low_u16 (d->special_bound)); ++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); ++ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. 
*/ +- r2 = vmulq_f32 (r, r); ++ float32x4_t r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ +- p = vfmaq_f32 (P (5), P (6), r); +- q = vfmaq_f32 (P (3), P (4), r); +- y = vfmaq_f32 (P (1), P (2), r); +- p = vfmaq_f32 (p, P (7), r2); ++ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); ++ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); ++ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); ++ p = vfmaq_laneq_f32 (p, r2, c1350, 3); ++ + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + +commit 41dc9e7c2d80bc5e886950b8a7bd21f77c9793b3 +Author: Joana Cruz +Date: Tue Dec 17 14:49:30 2024 +0000 + + AArch64: Improve codegen of AdvSIMD atan(2)(f) + + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. + 8% improvement in throughput microbenchmark on Neoverse V1. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 6914774b9d3460876d9ad4482782213ec01a752e) + +diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c +index b1e7a9b8fc..1a8f02109f 100644 +--- a/sysdeps/aarch64/fpu/atan2_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2_advsimd.c +@@ -23,40 +23,57 @@ + + static const struct data + { ++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; +- float64x2_t poly[20]; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; ++ uint64x2_t zeroinfnan, minustwo; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on +- the interval [2**-1022, 1.0]. */ +- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), +- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), +- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), +- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), +- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), +- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), +- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), +- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), +- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), +- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, ++ [2**-1022, 1.0]. */ ++ .c0 = V2 (-0x1.5555555555555p-2), ++ .c1 = 0x1.99999999996c1p-3, ++ .c2 = V2 (-0x1.2492492478f88p-3), ++ .c3 = 0x1.c71c71bc3951cp-4, ++ .c4 = V2 (-0x1.745d160a7e368p-4), ++ .c5 = 0x1.3b139b6a88ba1p-4, ++ .c6 = V2 (-0x1.11100ee084227p-4), ++ .c7 = 0x1.e1d0f9696f63bp-5, ++ .c8 = V2 (-0x1.aebfe7b418581p-5), ++ .c9 = 0x1.842dbe9b0d916p-5, ++ .c10 = V2 (-0x1.5d30140ae5e99p-5), ++ .c11 = 0x1.338e31eb2fbbcp-5, ++ .c12 = V2 (-0x1.00e6eece7de8p-5), ++ .c13 = 0x1.860897b29e5efp-6, ++ .c14 = V2 (-0x1.0051381722a59p-6), ++ .c15 = 0x1.14e9dc19a4a4ep-7, ++ .c16 = V2 (-0x1.d0062b42fe3bfp-9), ++ .c17 = 0x1.17739e210171ap-10, ++ .c18 = V2 (-0x1.ab24da7be7402p-13), ++ .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), ++ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), ++ .minustwo = V2 (0xc000000000000000), + }; + + #define SignMask v_u64 (0x8000000000000000) + + /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) ++special_case (float64x2_t y, float64x2_t x, float64x2_t ret, ++ uint64x2_t sign_xy, uint64x2_t cmp) + { ++ /* Account for the sign of x and y. 
*/ ++ ret = vreinterpretq_f64_u64 ( ++ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + return v_call2_f64 (atan2, y, x, ret, cmp); + } + + /* Returns 1 if input is the bit representation of 0, infinity or nan. */ + static inline uint64x2_t +-zeroinfnan (uint64x2_t i) ++zeroinfnan (uint64x2_t i, const struct data *d) + { + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ +- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), +- v_u64 (2 * asuint64 (INFINITY) - 1)); ++ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); + } + + /* Fast implementation of vector atan2. +@@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i) + want 0x1.92d628ab678cfp-1. */ + float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); + +- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); ++ uint64x2_t special_cases ++ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); +@@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); +- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); ++ uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + + /* Set up z for call to atan. */ + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); +- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); +- float64x2_t z = vdivq_f64 (n, d); ++ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); ++ float64x2_t z = vdivq_f64 (n, q); + + /* Work out the correct shift. */ +- float64x2_t shift = vreinterpretq_f64_u64 ( +- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); ++ float64x2_t shift ++ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); +- shift = vmulq_f64 (shift, data_ptr->pi_over_2); ++ shift = vmulq_f64 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of +@@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); +- float64x2_t ret +- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), +- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); ++ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ float64x2_t c1315 = vld1q_f64 (&d->c13); ++ float64x2_t c1719 = vld1q_f64 (&d->c17); ++ ++ /* estrin_7. */ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ ++ float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ ++ /* estrin_11. 
*/ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ ++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); ++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); ++ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ ++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); ++ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); ++ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ ++ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ ++ float64x2_t ret = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); + ++ if (__glibc_unlikely (v_any_u64 (special_cases))) ++ return special_case (y, x, ret, sign_xy, special_cases); ++ + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + +- if (__glibc_unlikely (v_any_u64 (special_cases))) +- return special_case (y, x, ret, special_cases); +- + return ret; + } +diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c +index 56e610caf1..88daacd76c 100644 +--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c +@@ -22,34 +22,39 @@ + + static const struct data + { +- float32x4_t poly[8]; +- float32x4_t pi_over_2; ++ float32x4_t c0, pi_over_2, c4, c6, c2; ++ float c1, c3, c5, c7; ++ uint32x4_t comp_const; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ +- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), +- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), +- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, +- .pi_over_2 = V4 (0x1.921fb6p+0f), ++ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, ++ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, ++ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, ++ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, ++ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), + }; + + #define SignMask v_u32 (0x80000000) + + /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) ++special_case (float32x4_t y, float32x4_t x, float32x4_t ret, ++ uint32x4_t sign_xy, uint32x4_t cmp) + { ++ /* Account for the sign of y. */ ++ ret = vreinterpretq_f32_u32 ( ++ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + return v_call2_f32 (atan2f, y, x, ret, cmp); + } + + /* Returns 1 if input is the bit representation of 0, infinity or nan. */ + static inline uint32x4_t +-zeroinfnan (uint32x4_t i) ++zeroinfnan (uint32x4_t i, const struct data *d) + { + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ +- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), +- v_u32 (2 * 0x7f800000lu - 1)); ++ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); + } + + /* Fast implementation of vector atan2f. Maximum observed error is +@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i) + want 0x1.967f00p-1. 
*/ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); + +- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); ++ uint32x4_t special_cases ++ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); +@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + + /* Set up z for call to atanf. */ + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); +- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); +- float32x4_t z = vdivq_f32 (n, d); ++ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); ++ float32x4_t z = vdivq_f32 (n, q); + + /* Work out the correct shift. */ + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); +- shift = vmulq_f32 (shift, data_ptr->pi_over_2); ++ shift = vmulq_f32 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, +@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + +- float32x4_t ret = vfmaq_f32 ( +- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, +- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); ++ float32x4_t c1357 = vld1q_f32 (&d->c1); ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); ++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); ++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); ++ float32x4_t p03 = vfmaq_f32 (p01, z4, p23); ++ float32x4_t p47 = vfmaq_f32 (p45, z4, p67); ++ ++ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + + /* y = shift + z * P(z^2). */ + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + +- /* Account for the sign of y. */ +- ret = vreinterpretq_f32_u32 ( +- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); +- + if (__glibc_unlikely (v_any_u32 (special_cases))) + { +- return special_case (y, x, ret, special_cases); ++ return special_case (y, x, ret, sign_xy, special_cases); + } + +- return ret; ++ /* Account for the sign of y. */ ++ return vreinterpretq_f32_u32 ( ++ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + } + libmvec_hidden_def (V_NAME_F2 (atan2)) + HALF_WIDTH_ALIAS_F2(atan2) +diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c +index a962be0f78..14f1809796 100644 +--- a/sysdeps/aarch64/fpu/atan_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan_advsimd.c +@@ -22,21 +22,22 @@ + + static const struct data + { ++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; +- float64x2_t poly[20]; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. 
*/ +- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), +- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), +- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), +- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), +- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), +- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), +- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), +- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), +- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), +- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, ++ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, ++ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, ++ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, ++ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, ++ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, ++ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, ++ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, ++ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, ++ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, ++ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + }; + +@@ -52,6 +53,11 @@ static const struct data + float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ float64x2_t c1315 = vld1q_f64 (&d->c13); ++ float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need +@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); +- float64x2_t y +- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), +- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); ++ ++ /* estrin_7. */ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ ++ float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ ++ /* estrin_11. */ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ ++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); ++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); ++ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ ++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); ++ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); ++ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ ++ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ ++ float64x2_t y = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). 
*/ + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); + +commit bf2b60a56036c951a798845223a2e04cc48507e4 +Author: Joana Cruz +Date: Tue Dec 17 14:50:33 2024 +0000 + + AArch64: Improve codegen of AdvSIMD expf family + + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. + Also use intrinsics instead of native operations. + expf: 3% improvement in throughput microbenchmark on Neoverse V1, exp2f: 5%, + exp10f: 13%, coshf: 14%. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit cff9648d0b50d19cdaf685f6767add040d4e1a8e) + +diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c +index c1ab4923b8..cd5c866521 100644 +--- a/sysdeps/aarch64/fpu/coshf_advsimd.c ++++ b/sysdeps/aarch64/fpu/coshf_advsimd.c +@@ -23,19 +23,27 @@ + static const struct data + { + struct v_expf_data expf_consts; +- uint32x4_t tiny_bound, special_bound; ++ uint32x4_t tiny_bound; ++ float32x4_t bound; ++#if WANT_SIMD_EXCEPT ++ uint32x4_t special_bound; ++#endif + } data = { + .expf_consts = V_EXPF_DATA, + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ ++ .bound = V4 (0x1.5a92d8p+6), ++#if WANT_SIMD_EXCEPT + .special_bound = V4 (0x42ad496c), ++#endif + }; + + #if !WANT_SIMD_EXCEPT + static float32x4_t NOINLINE VPCS_ATTR +-special_case (float32x4_t x, float32x4_t y, uint32x4_t special) ++special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, ++ uint32x4_t special) + { +- return v_call_f32 (coshf, x, y, special); ++ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); + } + #endif + +@@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); + +- float32x4_t ax = vabsq_f32 (x); +- uint32x4_t iax = vreinterpretq_u32_f32 (ax); +- uint32x4_t special = vcgeq_u32 (iax, d->special_bound); +- + #if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ ++ float32x4_t ax = vabsq_f32 (x); ++ uint32x4_t iax = vreinterpretq_u32_f32 (ax); ++ uint32x4_t special = vcgeq_u32 (iax, d->special_bound); + if (__glibc_unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + +@@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) + input to 0, which will generate no exceptions. */ + if (__glibc_unlikely (v_any_u32 (tiny))) + ax = v_zerofy_f32 (ax, tiny); ++ float32x4_t t = v_expf_inline (ax, &d->expf_consts); ++#else ++ uint32x4_t special = vcageq_f32 (x, d->bound); ++ float32x4_t t = v_expf_inline (x, &d->expf_consts); + #endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ +- float32x4_t t = v_expf_inline (ax, &d->expf_consts); + float32x4_t half_t = vmulq_n_f32 (t, 0.5); + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); + +@@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); + #else + if (__glibc_unlikely (v_any_u32 (special))) +- return special_case (x, vaddq_f32 (half_t, half_over_t), special); ++ return special_case (x, half_t, half_over_t, special); + #endif + + return vaddq_f32 (half_t, half_over_t); +diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c +index cf53e73290..55d9cd83f2 100644 +--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c ++++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c +@@ -18,16 +18,15 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + #define ScaleBound 192.0f + + static const struct data + { +- float32x4_t poly[5]; +- float log10_2_and_inv[4]; +- float32x4_t shift; +- ++ float32x4_t c0, c1, c3; ++ float log10_2_high, log10_2_low, c2, c4; ++ float32x4_t inv_log10_2, special_bound; ++ uint32x4_t exponent_bias, special_offset, special_bias; + #if !WANT_SIMD_EXCEPT + float32x4_t scale_thresh; + #endif +@@ -37,19 +36,24 @@ static const struct data + rel error: 0x1.89dafa3p-24 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] + maxerr: 1.85943 +0.5 ulp. */ +- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), +- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, +- .shift = V4 (0x1.8p23f), +- +- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ +- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, ++ .c0 = V4 (0x1.26bb16p+1f), ++ .c1 = V4 (0x1.5350d2p+1f), ++ .c2 = 0x1.04744ap+1f, ++ .c3 = V4 (0x1.2d8176p+0f), ++ .c4 = 0x1.12b41ap-1f, ++ .inv_log10_2 = V4 (0x1.a934fp+1), ++ .log10_2_high = 0x1.344136p-2, ++ .log10_2_low = 0x1.ec10cp-27, ++ /* rint (log2 (2^127 / (1 + sqrt (2)))). */ ++ .special_bound = V4 (126.0f), ++ .exponent_bias = V4 (0x3f800000), ++ .special_offset = V4 (0x82000000), ++ .special_bias = V4 (0x7f000000), + #if !WANT_SIMD_EXCEPT + .scale_thresh = V4 (ScaleBound) + #endif + }; + +-#define ExponentBias v_u32 (0x3f800000) +- + #if WANT_SIMD_EXCEPT + + # define SpecialBound 38.0f /* rint(log10(2^127)). */ +@@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) + + #else + +-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ +-# define SpecialOffset v_u32 (0x82000000) +-# define SpecialBias v_u32 (0x7f000000) ++# define SpecialBound 126.0f + + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) + { + /* 2^n may overflow, break it up into s1*s2. */ +- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); +- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); ++ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); ++ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); +@@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. 
*/ +- float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); +- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); +- float32x4_t n = vsubq_f32 (z, d->shift); +- float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); +- r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); +- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); ++ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high); ++ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); ++ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0); ++ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1); ++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); + +- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); ++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + #if !WANT_SIMD_EXCEPT +- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); ++ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + #endif + + float32x4_t r2 = vmulq_f32 (r, r); +- float32x4_t poly +- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), +- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); ++ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); ++ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3); ++ float32x4_t p14 = vfmaq_f32 (p12, r2, p34); ++ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); + + if (__glibc_unlikely (v_any_u32 (cmp))) + #if WANT_SIMD_EXCEPT +diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c +index 69e0b193a1..a4220da63c 100644 +--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c +@@ -21,24 +21,28 @@ + + static const struct data + { +- float32x4_t poly[5]; +- uint32x4_t exponent_bias; ++ float32x4_t c1, c3; ++ uint32x4_t exponent_bias, special_offset, special_bias; + #if !WANT_SIMD_EXCEPT +- float32x4_t special_bound, scale_thresh; ++ float32x4_t scale_thresh, special_bound; + #endif ++ float c0, c2, c4, zero; + } data = { + /* maxerr: 1.962 ulp. */ +- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), +- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, ++ .c0 = 0x1.59977ap-10f, ++ .c1 = V4 (0x1.3ce9e4p-7f), ++ .c2 = 0x1.c6bd32p-5f, ++ .c3 = V4 (0x1.ebf9bcp-3f), ++ .c4 = 0x1.62e422p-1f, + .exponent_bias = V4 (0x3f800000), ++ .special_offset = V4 (0x82000000), ++ .special_bias = V4 (0x7f000000), + #if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + #endif + }; + +-#define C(i) d->poly[i] +- + #if WANT_SIMD_EXCEPT + + # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +@@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) + + #else + +-# define SpecialOffset v_u32 (0x82000000) +-# define SpecialBias v_u32 (0x7f000000) +- + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) + { + /* 2^n may overflow, break it up into s1*s2. 
*/ +- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); +- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); ++ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); ++ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); +@@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- float32x4_t n, r, r2, scale, p, q, poly; +- uint32x4_t cmp, e; + + #if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); +- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); ++ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv +@@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) + x = vbslq_f32 (cmp, v_f32 (1), x); + #endif + +- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] +- x = n + r, with r in [-1/2, 1/2]. */ +- n = vrndaq_f32 (x); +- r = vsubq_f32 (x, n); +- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); +- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); ++ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] ++ x = n + r, with r in [-1/2, 1/2]. */ ++ float32x4_t n = vrndaq_f32 (x); ++ float32x4_t r = vsubq_f32 (x, n); ++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); ++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + #if !WANT_SIMD_EXCEPT +- cmp = vcagtq_f32 (n, d->special_bound); ++ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + #endif + +- r2 = vmulq_f32 (r, r); +- p = vfmaq_f32 (C (1), C (0), r); +- q = vfmaq_f32 (C (3), C (2), r); ++ float32x4_t c024 = vld1q_f32 (&d->c0); ++ float32x4_t r2 = vmulq_f32 (r, r); ++ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); ++ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); + q = vfmaq_f32 (q, p, r2); +- p = vmulq_f32 (C (4), r); +- poly = vfmaq_f32 (p, q, r2); ++ p = vmulq_laneq_f32 (r, c024, 2); ++ float32x4_t poly = vfmaq_f32 (p, q, r2); + + if (__glibc_unlikely (v_any_u32 (cmp))) + #if WANT_SIMD_EXCEPT +diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c +index 5c9cb72620..70f137e2e5 100644 +--- a/sysdeps/aarch64/fpu/expf_advsimd.c ++++ b/sysdeps/aarch64/fpu/expf_advsimd.c +@@ -21,20 +21,25 @@ + + static const struct data + { +- float32x4_t poly[5]; +- float32x4_t inv_ln2, ln2_hi, ln2_lo; +- uint32x4_t exponent_bias; ++ float32x4_t c1, c3, c4, inv_ln2; ++ float ln2_hi, ln2_lo, c0, c2; ++ uint32x4_t exponent_bias, special_offset, special_bias; + #if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; + #endif + } data = { + /* maxerr: 1.45358 +0.5 ulp. 
*/ +- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), +- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, ++ .c0 = 0x1.0e4020p-7f, ++ .c1 = V4 (0x1.573e2ep-5f), ++ .c2 = 0x1.555e66p-3f, ++ .c3 = V4 (0x1.fffdb6p-2f), ++ .c4 = V4 (0x1.ffffecp-1f), + .inv_ln2 = V4 (0x1.715476p+0f), +- .ln2_hi = V4 (0x1.62e4p-1f), +- .ln2_lo = V4 (0x1.7f7d1cp-20f), ++ .ln2_hi = 0x1.62e4p-1f, ++ .ln2_lo = 0x1.7f7d1cp-20f, + .exponent_bias = V4 (0x3f800000), ++ .special_offset = V4 (0x82000000), ++ .special_bias = V4 (0x7f000000), + #if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +@@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) + + #else + +-# define SpecialOffset v_u32 (0x82000000) +-# define SpecialBias v_u32 (0x7f000000) +- + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) + { + /* 2^n may overflow, break it up into s1*s2. */ +- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); +- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); ++ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); ++ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); ++ // (s2 + p*s2)*s1 = s2(p+1)s1 + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); +@@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- float32x4_t n, r, r2, scale, p, q, poly; +- uint32x4_t cmp, e; ++ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); + + #if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ +- cmp = vcgeq_u32 ( ++ uint32x4_t cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); +@@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ +- n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); +- r = vfmsq_f32 (x, n, d->ln2_hi); +- r = vfmsq_f32 (r, n, d->ln2_lo); +- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); +- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); ++ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); ++ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); ++ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); ++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); ++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + #if !WANT_SIMD_EXCEPT +- cmp = vcagtq_f32 (n, d->special_bound); ++ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + #endif + +- r2 = vmulq_f32 (r, r); +- p = vfmaq_f32 (C (1), C (0), r); +- q = vfmaq_f32 (C (3), C (2), r); ++ float32x4_t r2 = vmulq_f32 (r, r); ++ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); ++ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); + q = vfmaq_f32 (q, p, r2); +- p = vmulq_f32 (C (4), r); +- poly = vfmaq_f32 (p, q, r2); ++ p = vmulq_f32 (d->c4, r); ++ float32x4_t poly = vfmaq_f32 (p, q, r2); + + if (__glibc_unlikely (v_any_u32 (cmp))) + #if WANT_SIMD_EXCEPT +diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h +index 08b06e0a6b..eacd2af241 100644 +--- a/sysdeps/aarch64/fpu/v_expf_inline.h ++++ b/sysdeps/aarch64/fpu/v_expf_inline.h +@@ -24,50 +24,45 @@ + + struct v_expf_data + { +- float32x4_t poly[5]; +- float32x4_t shift; +- float invln2_and_ln2[4]; ++ float ln2_hi, ln2_lo, c0, c2; ++ float32x4_t inv_ln2, c1, c3, c4; ++ /* asuint(1.0f). */ ++ uint32x4_t exponent_bias; + }; + + /* maxerr: 1.45358 +0.5 ulp. */ + #define V_EXPF_DATA \ + { \ +- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ +- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ +- .shift = V4 (0x1.8p23f), \ +- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ ++ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ ++ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ ++ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ ++ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ + } + +-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ +-#define C(i) d->poly[i] +- + static inline float32x4_t + v_expf_inline (float32x4_t x, const struct v_expf_data *d) + { +- /* Helper routine for calculating exp(x). ++ /* Helper routine for calculating exp(ax). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + +- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] +- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +- float32x4_t n, r, z; +- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); +- z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0); +- n = vsubq_f32 (z, d->shift); +- r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1); +- r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2); +- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); +- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); ++ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] ++ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ ++ float32x4_t ax = vabsq_f32 (x); ++ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); ++ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); ++ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); ++ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); ++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); ++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + /* Custom order-4 Estrin avoids building high order monomial. */ + float32x4_t r2 = vmulq_f32 (r, r); +- float32x4_t p, q, poly; +- p = vfmaq_f32 (C (1), C (0), r); +- q = vfmaq_f32 (C (3), C (2), r); ++ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); ++ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); + q = vfmaq_f32 (q, p, r2); +- p = vmulq_f32 (C (4), r); +- poly = vfmaq_f32 (p, q, r2); ++ p = vmulq_f32 (d->c4, r); ++ float32x4_t poly = vfmaq_f32 (p, q, r2); + return vfmaq_f32 (scale, poly, scale); + } +- + #endif + +commit abfd20ebbd2883f2c6e5f16709f7b9781c3c8068 +Author: Luna Lamb +Date: Fri Jan 3 19:00:12 2025 +0000 + + AArch64: Improve codegen in AdvSIMD asinh + + Improves memory access and removes spills. + Load the polynomial evaluation coefficients into 2 vectors and use lanewise + MLAs. Reduces MOVs 6->3 , LDR 11->5, STR/STP 2->0, ADRP 3->2. + + (cherry picked from commit 140b985e5a2071000122b3cb63ebfe88cf21dd29) + +diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c +index 6207e7da95..2739f98b39 100644 +--- a/sysdeps/aarch64/fpu/asinh_advsimd.c ++++ b/sysdeps/aarch64/fpu/asinh_advsimd.c +@@ -20,41 +20,71 @@ + #include "v_math.h" + #include "poly_advsimd_f64.h" + +-#define A(i) v_f64 (__v_log_data.poly[i]) +-#define N (1 << V_LOG_TABLE_BITS) +-#define IndexMask (N - 1) +- + const static struct data + { +- float64x2_t poly[18]; +- uint64x2_t off, huge_bound, abs_mask; +- float64x2_t ln2, tiny_bound; ++ uint64x2_t huge_bound, abs_mask, off, mask; ++#if WANT_SIMD_EXCEPT ++ float64x2_t tiny_bound; ++#endif ++ float64x2_t lc0, lc2; ++ double lc1, lc3, ln2, lc4; ++ ++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; ++ double c1, c3, c5, c7, c9, c11, c13, c15; ++ + } data = { +- .off = V2 (0x3fe6900900000000), +- .ln2 = V2 (0x1.62e42fefa39efp-1), +- .huge_bound = V2 (0x5fe0000000000000), ++ ++#if WANT_SIMD_EXCEPT + .tiny_bound = V2 (0x1p-26), +- .abs_mask = V2 (0x7fffffffffffffff), ++#endif + /* Even terms of polynomial s.t. asinh(x) is approximated by + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). 
*/ +- .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4), +- V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6), +- V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6), +- V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7), +- V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7), +- V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8), +- V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9), +- V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12), +- V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) }, ++ ++ .c0 = V2 (-0x1.55555555554a7p-3), ++ .c1 = 0x1.3333333326c7p-4, ++ .c2 = V2 (-0x1.6db6db68332e6p-5), ++ .c3 = 0x1.f1c71b26fb40dp-6, ++ .c4 = V2 (-0x1.6e8b8b654a621p-6), ++ .c5 = 0x1.1c4daa9e67871p-6, ++ .c6 = V2 (-0x1.c9871d10885afp-7), ++ .c7 = 0x1.7a16e8d9d2ecfp-7, ++ .c8 = V2 (-0x1.3ddca533e9f54p-7), ++ .c9 = 0x1.0becef748dafcp-7, ++ .c10 = V2 (-0x1.b90c7099dd397p-8), ++ .c11 = 0x1.541f2bb1ffe51p-8, ++ .c12 = V2 (-0x1.d217026a669ecp-9), ++ .c13 = 0x1.0b5c7977aaf7p-9, ++ .c14 = V2 (-0x1.e0f37daef9127p-11), ++ .c15 = 0x1.388b5fe542a6p-12, ++ .c16 = V2 (-0x1.021a48685e287p-14), ++ .c17 = V2 (0x1.93d4ba83d34dap-18), ++ ++ .lc0 = V2 (-0x1.ffffffffffff7p-2), ++ .lc1 = 0x1.55555555170d4p-2, ++ .lc2 = V2 (-0x1.0000000399c27p-2), ++ .lc3 = 0x1.999b2e90e94cap-3, ++ .lc4 = -0x1.554e550bd501ep-3, ++ .ln2 = 0x1.62e42fefa39efp-1, ++ ++ .off = V2 (0x3fe6900900000000), ++ .huge_bound = V2 (0x5fe0000000000000), ++ .abs_mask = V2 (0x7fffffffffffffff), ++ .mask = V2 (0xfffULL << 52), + }; + + static float64x2_t NOINLINE VPCS_ATTR +-special_case (float64x2_t x, float64x2_t y, uint64x2_t special) ++special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask, ++ uint64x2_t special) + { ++ /* Copy sign. */ ++ y = vbslq_f64 (abs_mask, y, x); + return v_call_f64 (asinh, x, y, special); + } + ++#define N (1 << V_LOG_TABLE_BITS) ++#define IndexMask (N - 1) ++ + struct entry + { + float64x2_t invc; +@@ -76,27 +106,34 @@ lookup (uint64x2_t i) + } + + static inline float64x2_t +-log_inline (float64x2_t x, const struct data *d) ++log_inline (float64x2_t xm, const struct data *d) + { +- /* Double-precision vector log, copied from ordinary vector log with some +- cosmetic modification and special-cases removed. */ +- uint64x2_t ix = vreinterpretq_u64_f64 (x); +- uint64x2_t tmp = vsubq_u64 (ix, d->off); +- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); +- uint64x2_t iz +- = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52))); ++ ++ uint64x2_t u = vreinterpretq_u64_f64 (xm); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); ++ ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); +- struct entry e = lookup (tmp); ++ ++ struct entry e = lookup (u_off); ++ ++ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); +- float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); ++ ++ /* hi = r + log(c) + k*Ln2. */ ++ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2); ++ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0); ++ ++ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ ++ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1); + float64x2_t r2 = vmulq_f64 (r, r); +- float64x2_t y = vfmaq_f64 (A (2), A (3), r); +- float64x2_t p = vfmaq_f64 (A (0), A (1), r); +- y = vfmaq_f64 (y, A (4), r2); +- y = vfmaq_f64 (p, y, r2); +- y = vfmaq_f64 (hi, y, r2); +- return y; ++ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0); ++ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1); ++ y = vfmaq_f64 (p, r2, y); ++ return vfmaq_f64 (hi, y, r2); + } + + /* Double-precision implementation of vector asinh(x). +@@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d) + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial +- shared with the scalar routine. The greatest observed error 3.29 ULP, in ++ shared with the scalar routine. The greatest observed error 2.79 ULP, in + |x| >= 1: +- __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 +- want 0x1.ffffcfd0e2352p-1. */ ++ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 ++ want 0x1.ffffd003219ddp-1. */ + VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- + float64x2_t ax = vabsq_f64 (x); +- uint64x2_t iax = vreinterpretq_u64_f64 (ax); + + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); +- uint64x2_t special = vcgeq_u64 (iax, d->huge_bound); + + #if WANT_SIMD_EXCEPT ++ uint64x2_t iax = vreinterpretq_u64_f64 (ax); ++ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); + special = vorrq_u64 (special, tiny); ++#else ++ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); + #endif + + /* Option 1: |x| >= 1. +@@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: +- __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 +- want 0x1.c1d6bf874019cp-1. */ ++ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 ++ want 0x1.c1d6bf874019cp-1. */ + float64x2_t option_2 = v_f64 (0); ++ + if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1)))) + { ++ + #if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); + #endif +- float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2), +- z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2), +- z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8); +- float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly); +- option_2 = vfmaq_f64 (ax, p, x3); ++ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); ++ /* Order-17 Pairwise Horner scheme. 
*/ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ float64x2_t c1315 = vld1q_f64 (&d->c13); ++ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1); ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1); ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1); ++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0); ++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1); ++ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17); ++ ++ float64x2_t p = vfmaq_f64 (p1415, z2, p1617); ++ p = vfmaq_f64 (p1213, z2, p); ++ p = vfmaq_f64 (p1011, z2, p); ++ p = vfmaq_f64 (p89, z2, p); ++ ++ p = vfmaq_f64 (p67, z2, p); ++ p = vfmaq_f64 (p45, z2, p); ++ ++ p = vfmaq_f64 (p23, z2, p); ++ ++ p = vfmaq_f64 (p01, z2, p); ++ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); + #if WANT_SIMD_EXCEPT + option_2 = vbslq_f64 (tiny, x, option_2); + #endif +@@ -167,10 +231,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) + + /* Choose the right option for each lane. */ + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); +- /* Copy sign. */ +- y = vbslq_f64 (d->abs_mask, y, x); +- + if (__glibc_unlikely (v_any_u64 (special))) +- return special_case (x, y, special); +- return y; ++ { ++ return special_case (x, y, d->abs_mask, special); ++ } ++ /* Copy sign. */ ++ return vbslq_f64 (d->abs_mask, y, x); + } + +commit 5f45c0f91eae99b7d49f5c63b900441eb3491213 +Author: Luna Lamb +Date: Fri Jan 3 19:02:52 2025 +0000 + + AArch64: Improve codegen in SVE tans + + Improves memory access. + Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return. + Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move mov away from return. + + (cherry picked from commit aa6609feb20ebf8653db639dabe2a6afc77b02cc) + +diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c +index b2e4447316..a7318fd417 100644 +--- a/sysdeps/aarch64/fpu/tan_sve.c ++++ b/sysdeps/aarch64/fpu/tan_sve.c +@@ -22,24 +22,38 @@ + + static const struct data + { +- double poly[9]; +- double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift; ++ double c2, c4, c6, c8; ++ double poly_1357[4]; ++ double c0, inv_half_pi; ++ double half_pi_hi, half_pi_lo, range_val; + } data = { + /* Polynomial generated with FPMinimax. 
*/ +- .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, +- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, +- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, +- 0x1.4e4fd14147622p-12, }, ++ .c2 = 0x1.ba1ba1bb46414p-5, ++ .c4 = 0x1.226e5e5ecdfa3p-7, ++ .c6 = 0x1.7ea75d05b583ep-10, ++ .c8 = 0x1.4e4fd14147622p-12, ++ .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, ++ 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, ++ .c0 = 0x1.5555555555556p-2, ++ .inv_half_pi = 0x1.45f306dc9c883p-1, + .half_pi_hi = 0x1.921fb54442d18p0, + .half_pi_lo = 0x1.1a62633145c07p-54, +- .inv_half_pi = 0x1.45f306dc9c883p-1, + .range_val = 0x1p23, +- .shift = 0x1.8p52, + }; + + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, ++ svbool_t special) + { ++ svbool_t use_recip = svcmpeq ( ++ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); ++ ++ svfloat64_t n = svmad_x (pg, p, p, -1); ++ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); ++ svfloat64_t swap = n; ++ n = svneg_m (n, use_recip, d); ++ d = svsel (use_recip, swap, d); ++ svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); + return sv_call_f64 (tan, x, y, special); + } + +@@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) + { + const struct data *dat = ptr_barrier (&data); +- +- /* Invert condition to catch NaNs and Infs as well as large values. */ +- svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); +- ++ svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); + /* q = nearest integer to 2 * x / pi. */ +- svfloat64_t shift = sv_f64 (dat->shift); +- svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi); +- q = svsub_x (pg, q, shift); +- svint64_t qi = svcvt_s64_x (pg, q); ++ svfloat64_t q = svmul_lane (x, half_pi_c0, 1); ++ q = svrinta_x (pg, q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ +@@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) + r = svmls_lane (r, q, half_pi, 1); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ +- r = svmul_x (pg, r, 0.5); ++ r = svmul_x (svptrue_b64 (), r, 0.5); + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: +@@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t r4 = svmul_x (pg, r2, r2); +- svfloat64_t r8 = svmul_x (pg, r4, r4); ++ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2); ++ svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4); + /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ +- svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); +- p = svmad_x (pg, p, r2, dat->poly[0]); +- p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); ++ svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2); ++ svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6); ++ ++ /* Use offset version coeff array by 1 to evaluate from C1 onwards. 
*/ ++ svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0); ++ svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1); ++ svfloat64_t p03 = svmla_x (pg, p01, p23, r4); ++ ++ svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1); ++ svfloat64_t p47 = svmla_x (pg, p45, p67, r4); ++ ++ svfloat64_t p = svmla_x (pg, p03, p47, r8); ++ ++ svfloat64_t z = svmul_x (svptrue_b64 (), p, r); ++ z = svmul_x (svptrue_b64 (), r2, z); ++ z = svmla_lane (z, r, half_pi_c0, 0); ++ p = svmla_x (pg, r, r2, z); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of +- numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ +- svbool_t use_recip +- = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); ++ numerator/denominator dependent on odd/even-ness of q (quadrant). */ ++ ++ /* Invert condition to catch NaNs and Infs as well as large values. */ ++ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); ++ ++ if (__glibc_unlikely (svptest_any (pg, special))) ++ { ++ return special_case (x, p, q, pg, special); ++ } ++ svbool_t use_recip = svcmpeq ( ++ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); +- svfloat64_t d = svmul_x (pg, p, 2); ++ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); + return svdiv_x (pg, n, d); + } +diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c +index f342583241..e850fb4882 100644 +--- a/sysdeps/aarch64/fpu/tanf_sve.c ++++ b/sysdeps/aarch64/fpu/tanf_sve.c +@@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- /* Determine whether input is too large to perform fast regression. */ +- svbool_t cmp = svacge (pg, x, d->range_val); +- + svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); + + /* n = rint(x/(pi/2)). */ +- svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); +- svfloat32_t n = svsub_x (pg, q, d->shift); ++ svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); + /* n is already a signed integer, simply convert it. */ + svint32_t in = svcvt_s32_x (pg, n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + svint32_t alt = svand_x (pg, in, 1); + svbool_t pred_alt = svcmpne (pg, alt, 0); +- + /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ + svfloat32_t r; + r = svmls_lane (x, n, pi_vals, 0); +@@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], + using Estrin on z^2. 
*/ +- svfloat32_t z2 = svmul_x (pg, z, z); ++ svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); +@@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) + + svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); + +- /* Transform result back, if necessary. */ +- svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); +- + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ ++ ++ /* Determine whether input is too large to perform fast regression. */ ++ svbool_t cmp = svacge (pg, x, d->range_val); + if (__glibc_unlikely (svptest_any (pg, cmp))) +- return special_case (x, svsel (pred_alt, inv_y, y), cmp); ++ return special_case (x, svdivr_x (pg, y, 1.0f), cmp); + ++ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); + return svsel (pred_alt, inv_y, y); + } + +commit ab5ba6c188159bb5e12be95cd90458924c2fe592 +Author: Yat Long Poon +Date: Fri Jan 3 19:07:30 2025 +0000 + + AArch64: Improve codegen for SVE logs + + Reduce memory access by using lanewise MLA and moving constants to struct + and reduce number of MOVPRFXs. + Update maximum ULP error for double log_sve from 1 to 2. + Speedup on Neoverse V1 for log (3%), log2 (5%), and log10 (4%). + + (cherry picked from commit 32d193a372feb28f9da247bb7283d404b84429c6) + +diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c +index ab7362128d..f1cad2759a 100644 +--- a/sysdeps/aarch64/fpu/log10_sve.c ++++ b/sysdeps/aarch64/fpu/log10_sve.c +@@ -23,28 +23,49 @@ + #define Min 0x0010000000000000 + #define Max 0x7ff0000000000000 + #define Thres 0x7fe0000000000000 /* Max - Min. */ +-#define Off 0x3fe6900900000000 + #define N (1 << V_LOG10_TABLE_BITS) + ++static const struct data ++{ ++ double c0, c2; ++ double c1, c3; ++ double invln10, log10_2; ++ double c4; ++ uint64_t off; ++} data = { ++ .c0 = -0x1.bcb7b1526e506p-3, ++ .c1 = 0x1.287a7636be1d1p-3, ++ .c2 = -0x1.bcb7b158af938p-4, ++ .c3 = 0x1.63c78734e6d07p-4, ++ .c4 = -0x1.287461742fee4p-4, ++ .invln10 = 0x1.bcb7b1526e50ep-2, ++ .log10_2 = 0x1.34413509f79ffp-2, ++ .off = 0x3fe6900900000000, ++}; ++ + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, ++ svbool_t special, const struct data *d) + { +- return sv_call_f64 (log10, x, y, special); ++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); ++ return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special); + } + +-/* SVE log10 algorithm. ++/* Double-precision SVE log10 routine. + Maximum measured error is 2.46 ulps. + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ + svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); +@@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) + svfloat64_t r = svmad_x (pg, invc, z, -1.0); + + /* hi = log(c) + k*log(2). */ +- svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); +- svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); ++ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10); ++ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0); ++ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); ++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); ++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); ++ y = svmla_x (pg, y, r2, d->c4); ++ y = svmla_x (pg, p, r2, y); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), +- special); ++ return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); + } +diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c +index 743fa2a913..908e638246 100644 +--- a/sysdeps/aarch64/fpu/log2_sve.c ++++ b/sysdeps/aarch64/fpu/log2_sve.c +@@ -21,15 +21,32 @@ + #include "poly_sve_f64.h" + + #define N (1 << V_LOG2_TABLE_BITS) +-#define Off 0x3fe6900900000000 + #define Max (0x7ff0000000000000) + #define Min (0x0010000000000000) + #define Thresh (0x7fe0000000000000) /* Max - Min. */ + ++static const struct data ++{ ++ double c0, c2; ++ double c1, c3; ++ double invln2, c4; ++ uint64_t off; ++} data = { ++ .c0 = -0x1.71547652b83p-1, ++ .c1 = 0x1.ec709dc340953p-2, ++ .c2 = -0x1.71547651c8f35p-2, ++ .c3 = 0x1.2777ebe12dda5p-2, ++ .c4 = -0x1.ec738d616fe26p-3, ++ .invln2 = 0x1.71547652b82fep0, ++ .off = 0x3fe6900900000000, ++}; ++ + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) ++special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, ++ svbool_t special, const struct data *d) + { +- return sv_call_f64 (log2, x, y, cmp); ++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); ++ return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special); + } + + /* Double-precision SVE log2 routine. +@@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) + want 0x1.fffb34198d9ddp-5. */ + svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); +@@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + ++ svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); +- svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); +- +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); ++ svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0); + w = svadd_x (pg, k, w); + ++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); ++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); ++ y = svmla_lane_f64 (y, r2, invln2_and_c4, 1); ++ y = svmla_x (pg, p, r2, y); ++ + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), +- special); ++ return special_case (w, tmp, y, r2, special, d); + return svmla_x (pg, w, r2, y); + } +diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c +index 9b689f2ec7..044223400b 100644 +--- a/sysdeps/aarch64/fpu/log_sve.c ++++ b/sysdeps/aarch64/fpu/log_sve.c +@@ -19,39 +19,54 @@ + + #include "sv_math.h" + +-#define P(i) sv_f64 (__v_log_data.poly[i]) + #define N (1 << V_LOG_TABLE_BITS) +-#define Off (0x3fe6900900000000) +-#define MaxTop (0x7ff) +-#define MinTop (0x001) +-#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ ++#define Max (0x7ff0000000000000) ++#define Min (0x0010000000000000) ++#define Thresh (0x7fe0000000000000) /* Max - Min. */ ++ ++static const struct data ++{ ++ double c0, c2; ++ double c1, c3; ++ double ln2, c4; ++ uint64_t off; ++} data = { ++ .c0 = -0x1.ffffffffffff7p-2, ++ .c1 = 0x1.55555555170d4p-2, ++ .c2 = -0x1.0000000399c27p-2, ++ .c3 = 0x1.999b2e90e94cap-3, ++ .c4 = -0x1.554e550bd501ep-3, ++ .ln2 = 0x1.62e42fefa39efp-1, ++ .off = 0x3fe6900900000000, ++}; + + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) ++special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, ++ svbool_t special, const struct data *d) + { +- return sv_call_f64 (log, x, y, cmp); ++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); ++ return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special); + } + +-/* SVE port of AdvSIMD log algorithm. +- Maximum measured error is 2.17 ulp: +- SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 +- want 0x1.ffffff1cca045p-2. */ ++/* Double-precision SVE log routine. ++ Maximum measured error is 2.64 ulp: ++ SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6 ++ want 0x1.fffffffe88cafp+6. */ + svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + svuint64_t ix = svreinterpret_u64 (x); +- svuint64_t top = svlsr_x (pg, ix, 52); +- svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); ++ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. 
+ The ith subinterval contains z and c is near its center. */ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); +- svint64_t k +- = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + /* Lookup in 2 global lists (length N). */ +@@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ +- svfloat64_t r = svmad_x (pg, invc, z, -1); +- svfloat64_t kd = svcvt_f64_x (pg, k); ++ svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + /* hi = r + log(c) + k*Ln2. */ +- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); ++ svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2); ++ svfloat64_t r = svmad_x (pg, invc, z, -1); ++ svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0); ++ hi = svadd_x (pg, r, hi); ++ + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = svmla_x (pg, P (2), r, P (3)); +- svfloat64_t p = svmla_x (pg, P (0), r, P (1)); +- y = svmla_x (pg, y, r2, P (4)); ++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); ++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); ++ y = svmla_lane_f64 (y, r2, ln2_and_c4, 1); + y = svmla_x (pg, p, r2, y); + +- if (__glibc_unlikely (svptest_any (pg, cmp))) +- return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); ++ if (__glibc_unlikely (svptest_any (pg, special))) ++ return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); + } +diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps +index 6c96304611..b76c38dac2 100644 +--- a/sysdeps/aarch64/libm-test-ulps ++++ b/sysdeps/aarch64/libm-test-ulps +@@ -1460,7 +1460,7 @@ float: 2 + ldouble: 1 + + Function: "log_sve": +-double: 1 ++double: 2 + float: 3 + + Function: "log_towardzero": + +commit aa7c61ea15e27ae14717e065a5d4c50baa472851 +Author: Yat Long Poon +Date: Fri Jan 3 19:09:05 2025 +0000 + + AArch64: Improve codegen for SVE log1pf users + + Reduce memory access by using lanewise MLA and reduce number of MOVPRFXs. + Move log1pf implementation to inline helper function. + Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%). + + (cherry picked from commit 91c1fadba338752bf514cd4cca057b27b1b10eed) + +diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c +index 2110894e62..491365e24d 100644 +--- a/sysdeps/aarch64/fpu/acoshf_sve.c ++++ b/sysdeps/aarch64/fpu/acoshf_sve.c +@@ -17,23 +17,26 @@ + License along with the GNU C Library; if not, see + . */ + ++#include "sv_math.h" ++#include "sv_log1pf_inline.h" ++ + #define One 0x3f800000 + #define Thres 0x20000000 /* asuint(0x1p64) - One. 
*/ + +-#include "sv_log1pf_inline.h" +- + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special) + { ++ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f); ++ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ()); + return sv_call_f32 (acoshf, x, y, special); + } + + /* Single-precision SVE acosh(x) routine. Implements the same algorithm as + vector acoshf and log1p. + +- Maximum error is 2.78 ULPs: +- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4 +- want 0x1.f45b3cp-4. */ ++ Maximum error is 2.47 ULPs: ++ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4 ++ want 0x1.e435a2p-4. */ + svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) + { + svuint32_t ix = svreinterpret_u32 (x); +@@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) + + svfloat32_t xm1 = svsub_x (pg, x, 1.0f); + svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); +- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); ++ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u)); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, y, special); +- return y; ++ return special_case (xm1, tmp, special); ++ return sv_log1pf_inline (tmp, pg); + } +diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c +index d85c3a685c..b7f253bf32 100644 +--- a/sysdeps/aarch64/fpu/asinhf_sve.c ++++ b/sysdeps/aarch64/fpu/asinhf_sve.c +@@ -20,20 +20,23 @@ + #include "sv_math.h" + #include "sv_log1pf_inline.h" + +-#define BigBound (0x5f800000) /* asuint(0x1p64). */ ++#define BigBound 0x5f800000 /* asuint(0x1p64). */ + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special) + { ++ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); ++ y = svreinterpret_f32 ( ++ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y))); + return sv_call_f32 (asinhf, x, y, special); + } + + /* Single-precision SVE asinh(x) routine. Implements the same algorithm as + vector asinhf and log1p. + +- Maximum error is 2.48 ULPs: +- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4 +- want 0x1.ffbbb8p-4. */ ++ Maximum error is 1.92 ULPs: ++ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2 ++ want -0x1.fd0bc8p-2. */ + svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) + { + svfloat32_t ax = svabs_x (pg, x); +@@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) + = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case ( +- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))), +- special); ++ return special_case (iax, sign, y, special); + return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); + } +diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c +index dae83041ef..2d3005bbc8 100644 +--- a/sysdeps/aarch64/fpu/atanhf_sve.c ++++ b/sysdeps/aarch64/fpu/atanhf_sve.c +@@ -17,21 +17,25 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#include "sv_math.h" + #include "sv_log1pf_inline.h" + + #define One (0x3f800000) + #define Half (0x3f000000) + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign, ++ svfloat32_t y, svbool_t special) + { ++ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); ++ y = svmul_x (svptrue_b32 (), halfsign, y); + return sv_call_f32 (atanhf, x, y, special); + } + + /* Approximation for vector single-precision atanh(x) using modified log1p. +- The maximum error is 2.28 ULP: +- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5 +- want 0x1.ffbbb6p-5. */ ++ The maximum error is 1.99 ULP: ++ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5 ++ want 0x1.f1f4f6p-5. */ + svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) + { + svfloat32_t ax = svabs_x (pg, x); +@@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) + y = sv_log1pf_inline (y, pg); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmul_x (pg, halfsign, y), special); ++ return special_case (iax, sign, halfsign, y, special); + + return svmul_x (pg, halfsign, y); + } +diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c +index 5256d5e94c..18a185c838 100644 +--- a/sysdeps/aarch64/fpu/log1pf_sve.c ++++ b/sysdeps/aarch64/fpu/log1pf_sve.c +@@ -18,30 +18,13 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" +- +-static const struct data +-{ +- float poly[8]; +- float ln2, exp_bias; +- uint32_t four, three_quarters; +-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as +- this can be fmov-ed directly instead of including it in +- the main load-and-mla polynomial schedule. */ +- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, +- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, +- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f}, +- .ln2 = 0x1.62e43p-1f, +- .exp_bias = 0x1p-23f, +- .four = 0x40800000, +- .three_quarters = 0x3f400000}; +- +-#define SignExponentMask 0xff800000 ++#include "sv_log1pf_inline.h" + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svfloat32_t x, svbool_t special) + { +- return sv_call_f32 (log1pf, x, y, special); ++ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()), ++ special); + } + + /* Vector log1pf approximation using polynomial on reduced interval. Worst-case +@@ -50,53 +33,14 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) + want 0x1.9f323ep-2. */ + svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) + { +- const struct data *d = ptr_barrier (&data); + /* x < -1, Inf/Nan. */ + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); + special = svorn_z (pg, special, svcmpge (pg, x, -1)); + +- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m +- is in [-0.25, 0.5]): +- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). +- +- We approximate log1p(m) with a polynomial, then scale by +- k*log(2). Instead of doing this directly, we use an intermediate +- scale factor s = 4*k*log(2) to ensure the scale is representable +- as a normalised fp32 number. */ +- svfloat32_t m = svadd_x (pg, x, 1); +- +- /* Choose k to scale x to the range [-1/4, 1/2]. */ +- svint32_t k +- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), +- sv_s32 (SignExponentMask)); +- +- /* Scale x by exponent manipulation. 
*/ +- svfloat32_t m_scale = svreinterpret_f32 ( +- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); +- +- /* Scale up to ensure that the scale factor is representable as normalised +- fp32 number, and scale m down accordingly. */ +- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); +- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); +- +- /* Evaluate polynomial on reduced interval. */ +- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale), +- ms4 = svmul_x (pg, ms2, ms2); +- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly); +- p = svmad_x (pg, m_scale, p, -0.5); +- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); +- +- /* The scale factor to be applied back at the end - by multiplying float(k) +- by 2^-23 we get the unbiased exponent of k. */ +- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias); +- +- /* Apply the scaling back. */ +- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); +- + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, y, special); ++ return special_case (x, special); + +- return y; ++ return sv_log1pf_inline (x, pg); + } + + strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h +index b94b2da055..850297d615 100644 +--- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h ++++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h +@@ -22,55 +22,76 @@ + + #include "sv_math.h" + #include "vecmath_config.h" +-#include "poly_sve_f32.h" ++ ++#define SignExponentMask 0xff800000 + + static const struct sv_log1pf_data + { +- float32_t poly[9]; +- float32_t ln2; +- float32_t scale_back; ++ float c0, c2, c4, c6; ++ float c1, c3, c5, c7; ++ float ln2, exp_bias, quarter; ++ uint32_t four, three_quarters; + } sv_log1pf_data = { +- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */ +- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, +- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, +- -0x1.6f0d5ep-5f }, +- .scale_back = 0x1.0p-23f, +- .ln2 = 0x1.62e43p-1f, ++ /* Do not store first term of polynomial, which is -0.5, as ++ this can be fmov-ed directly instead of including it in ++ the main load-and-mla polynomial schedule. 
*/ ++ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f, ++ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f, ++ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f, ++ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000, ++ .three_quarters = 0x3f400000, + }; + +-static inline svfloat32_t +-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg) +-{ +- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); +- svfloat32_t m2 = svmul_x (pg, m, m); +- svfloat32_t q = svmla_x (pg, m, m2, p_12); +- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); +- p = svmul_x (pg, m2, p); +- +- return svmla_x (pg, q, m2, p); +-} +- + static inline svfloat32_t + sv_log1pf_inline (svfloat32_t x, svbool_t pg) + { + const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); + +- svfloat32_t m = svadd_x (pg, x, 1.0f); +- +- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m), +- svreinterpret_s32 (svdup_f32 (0.75f))); +- ks = svand_x (pg, ks, 0xff800000); +- svuint32_t k = svreinterpret_u32 (ks); +- svfloat32_t s = svreinterpret_f32 ( +- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k)); +- +- svfloat32_t m_scale +- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k)); +- m_scale +- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s)); +- svfloat32_t p = eval_poly (m_scale, d->poly, pg); +- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back); +- return svmla_x (pg, p, scale_back, d->ln2); ++ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m ++ is in [-0.25, 0.5]): ++ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). ++ ++ We approximate log1p(m) with a polynomial, then scale by ++ k*log(2). Instead of doing this directly, we use an intermediate ++ scale factor s = 4*k*log(2) to ensure the scale is representable ++ as a normalised fp32 number. */ ++ svfloat32_t m = svadd_x (pg, x, 1); ++ ++ /* Choose k to scale x to the range [-1/4, 1/2]. */ ++ svint32_t k ++ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), ++ sv_s32 (SignExponentMask)); ++ ++ /* Scale x by exponent manipulation. */ ++ svfloat32_t m_scale = svreinterpret_f32 ( ++ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); ++ ++ /* Scale up to ensure that the scale factor is representable as normalised ++ fp32 number, and scale m down accordingly. */ ++ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); ++ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2); ++ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2)); ++ ++ /* Evaluate polynomial on reduced interval. */ ++ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale); ++ ++ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1); ++ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0); ++ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1); ++ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2); ++ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3); ++ ++ svfloat32_t p = svmla_x (pg, p45, p67, ms2); ++ p = svmla_x (pg, p23, p, ms2); ++ p = svmla_x (pg, p01, p, ms2); ++ ++ p = svmad_x (pg, m_scale, p, -0.5); ++ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); ++ ++ /* The scale factor to be applied back at the end - by multiplying float(k) ++ by 2^-23 we get the unbiased exponent of k. 
*/ ++ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1); ++ return svmla_lane_f32 (p, scale_back, fconst, 0); + } + + #endif + +commit d983f14c304df2d880c7b01e904e4a889064b9b3 +Author: Luna Lamb +Date: Fri Jan 3 20:15:17 2025 +0000 + + AArch64: Improve codegen in SVE expm1f and users + + Use unpredicated muls, use absolute compare and improve memory access. + Expm1f, sinhf and tanhf show 7%, 5% and 1% improvement in throughput + microbenchmark on Neoverse V1. + + (cherry picked from commit f86b4cf87581cf1e45702b07880679ffa0b1f47a) + +diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c +index 7c852125cd..05a66400d4 100644 +--- a/sysdeps/aarch64/fpu/expm1f_sve.c ++++ b/sysdeps/aarch64/fpu/expm1f_sve.c +@@ -18,7 +18,6 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + /* Largest value of x for which expm1(x) should round to -1. */ + #define SpecialBound 0x1.5ebc4p+6f +@@ -28,20 +27,17 @@ static const struct data + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float c2, c4, ln2_hi, ln2_lo; +- float c0, c1, c3, inv_ln2, special_bound, shift; ++ float c0, inv_ln2, c1, c3, special_bound; + } data = { + /* Generated using fpminimax. */ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, +- .c4 = 0x1.6b55a2p-10, ++ .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f, ++ .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f, ++ .ln2_hi = 0x1.62e4p-1f, + +- .special_bound = SpecialBound, .shift = 0x1.8p23f, +- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, +- .ln2_lo = 0x1.7f7d1cp-20f, + }; + +-#define C(i) sv_f32 (d->c##i) +- + static svfloat32_t NOINLINE + special_case (svfloat32_t x, svbool_t pg) + { +@@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ +- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); +- j = svsub_x (pg, j, d->shift); +- svint32_t i = svcvt_s32_x (pg, j); ++ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); ++ j = svrinta_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); +@@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ +- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); +- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); +- svfloat32_t f2 = svmul_x (pg, f, f); ++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); ++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); ++ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); +- p = svmla_x (pg, C (0), f, p); ++ ++ p = svmla_x (pg, sv_f32 (d->c0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. 
*/ +- svfloat32_t t = svreinterpret_f32 ( +- svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000)); +- return svmla_x (pg, svsub_x (pg, t, 1), p, t); ++ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); ++ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); + } +diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c +index 6c204b57a2..50dd386774 100644 +--- a/sysdeps/aarch64/fpu/sinhf_sve.c ++++ b/sysdeps/aarch64/fpu/sinhf_sve.c +@@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg) + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (x, svmul_x (pg, t, halfsign), special); + +- return svmul_x (pg, t, halfsign); ++ return svmul_x (svptrue_b32 (), t, halfsign); + } +diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h +index 5b72451222..e46ddda543 100644 +--- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h ++++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h +@@ -27,21 +27,18 @@ struct sv_expm1f_data + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float32_t c2, c4, ln2_hi, ln2_lo; +- float32_t c0, c1, c3, inv_ln2, shift; ++ float c0, inv_ln2, c1, c3, special_bound; + }; + + /* Coefficients generated using fpminimax. */ + #define SV_EXPM1F_DATA \ + { \ +- .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \ +- .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ ++ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \ ++ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \ + \ +- .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ +- .ln2_lo = 0x1.7f7d1cp-20f, \ ++ .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \ + } + +-#define C(i) sv_f32 (d->c##i) +- + static inline svfloat32_t + expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) + { +@@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ +- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); +- j = svsub_x (pg, j, d->shift); +- svint32_t i = svcvt_s32_x (pg, j); ++ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); ++ j = svrinta_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); +@@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ +- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); +- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); +- svfloat32_t f2 = svmul_x (pg, f, f); ++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); ++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); ++ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); +- p = svmla_x (pg, C (0), f, p); ++ p = svmla_x (pg, sv_f32 (d->c0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. 
*/ +- svfloat32_t t = svscale_x (pg, sv_f32 (1), i); +- return svmla_x (pg, svsub_x (pg, t, 1), p, t); ++ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); ++ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); + } + + #endif +diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c +index 0b94523cf5..80dd679346 100644 +--- a/sysdeps/aarch64/fpu/tanhf_sve.c ++++ b/sysdeps/aarch64/fpu/tanhf_sve.c +@@ -19,20 +19,27 @@ + + #include "sv_expm1f_inline.h" + ++/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */ ++#define BoringBound 0x1.205966p+3f ++ + static const struct data + { + struct sv_expm1f_data expm1f_consts; +- uint32_t boring_bound, onef; ++ uint32_t onef, special_bound; ++ float boring_bound; + } data = { + .expm1f_consts = SV_EXPM1F_DATA, +- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ +- .boring_bound = 0x41102cb3, + .onef = 0x3f800000, ++ .special_bound = 0x7f800000, ++ .boring_bound = BoringBound, + }; + + static svfloat32_t NOINLINE +-special_case (svfloat32_t x, svfloat32_t y, svbool_t special) ++special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring, ++ svfloat32_t boring, svfloat32_t q, svbool_t special) + { ++ svfloat32_t y ++ = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0))); + return sv_call_f32 (tanhf, x, y, special); + } + +@@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); +- svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound); + svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); +- +- svbool_t special = svcmpgt (pg, iax, 0x7f800000); ++ svbool_t special = svcmpgt (pg, iax, d->special_bound); ++ svbool_t is_boring = svacgt (pg, x, d->boring_bound); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ +- svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts); +- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); ++ svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg, ++ &d->expm1f_consts); ++ + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svsel_f32 (is_boring, boring, y), special); ++ return special_case (x, pg, is_boring, boring, q, special); ++ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); + return svsel_f32 (is_boring, boring, y); + } + +commit 0ff6a9ff79bca9384ce4ba20e8942d39cc377a14 +Author: Luna Lamb +Date: Thu Feb 13 17:52:09 2025 +0000 + + Aarch64: Improve codegen in SVE asinh + + Use unpredicated muls, use lanewise mla's and improve memory access. + 1% regression in throughput microbenchmark on Neoverse V1. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 8f0e7fe61e0a2ad5ed777933703ce09053810ec4) + +diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c +index 28dc5c4587..fe8715e06c 100644 +--- a/sysdeps/aarch64/fpu/asinh_sve.c ++++ b/sysdeps/aarch64/fpu/asinh_sve.c +@@ -18,36 +18,49 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + #define SignMask (0x8000000000000000) + #define One (0x3ff0000000000000) + #define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). 
*/ ++#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1) + + static const struct data + { +- double poly[18]; +- double ln2, p3, p1, p4, p0, p2; +- uint64_t n; +- uint64_t off; ++ double even_coeffs[9]; ++ double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; ++ uint64_t off, mask; + + } data = { +- /* Polynomial generated using Remez on [2^-26, 1]. */ +- .poly +- = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, +- 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, +- -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, +- 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, +- -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, +- 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 }, ++ /* Polynomial generated using Remez on [2^-26, 1]. */ ++ .even_coeffs ={ ++ -0x1.55555555554a7p-3, ++ -0x1.6db6db68332e6p-5, ++ -0x1.6e8b8b654a621p-6, ++ -0x1.c9871d10885afp-7, ++ -0x1.3ddca533e9f54p-7, ++ -0x1.b90c7099dd397p-8, ++ -0x1.d217026a669ecp-9, ++ -0x1.e0f37daef9127p-11, ++ -0x1.021a48685e287p-14, }, ++ ++ .c1 = 0x1.3333333326c7p-4, ++ .c3 = 0x1.f1c71b26fb40dp-6, ++ .c5 = 0x1.1c4daa9e67871p-6, ++ .c7 = 0x1.7a16e8d9d2ecfp-7, ++ .c9 = 0x1.0becef748dafcp-7, ++ .c11 = 0x1.541f2bb1ffe51p-8, ++ .c13 = 0x1.0b5c7977aaf7p-9, ++ .c15 = 0x1.388b5fe542a6p-12, ++ .c17 = 0x1.93d4ba83d34dap-18, ++ + .ln2 = 0x1.62e42fefa39efp-1, + .p0 = -0x1.ffffffffffff7p-2, + .p1 = 0x1.55555555170d4p-2, + .p2 = -0x1.0000000399c27p-2, + .p3 = 0x1.999b2e90e94cap-3, + .p4 = -0x1.554e550bd501ep-3, +- .n = 1 << V_LOG_TABLE_BITS, +- .off = 0x3fe6900900000000 ++ .off = 0x3fe6900900000000, ++ .mask = 0xfffULL << 52, + }; + + static svfloat64_t NOINLINE +@@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) + of the algorithm used. 
*/ + + svuint64_t ix = svreinterpret_u64 (x); +- svuint64_t tmp = svsub_x (pg, ix, d->off); +- svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), +- (d->n - 1) << 1); +- svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); +- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); ++ svuint64_t i_off = svsub_x (pg, ix, d->off); ++ svuint64_t i ++ = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask); ++ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask)); + svfloat64_t z = svreinterpret_f64 (iz); + + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); +@@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) + svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1); + + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); +- svfloat64_t kd = svcvt_f64_x (pg, k); ++ svfloat64_t kd ++ = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52)); + + svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0); +- svfloat64_t r2 = svmul_x (pg, r, r); +- ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1); +- + svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0); ++ + y = svmla_lane (y, r2, p1_p4, 1); + y = svmla_x (pg, p, r2, y); + y = svmla_x (pg, hi, r2, y); +@@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) + svuint64_t iax = svbic_x (pg, ix, SignMask); + svuint64_t sign = svand_x (pg, ix, SignMask); + svfloat64_t ax = svreinterpret_f64 (iax); +- + svbool_t ge1 = svcmpge (pg, iax, One); + svbool_t special = svcmpge (pg, iax, Thres); + +@@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) + svfloat64_t option_1 = sv_f64 (0); + if (__glibc_likely (svptest_any (pg, ge1))) + { +- svfloat64_t x2 = svmul_x (pg, ax, ax); ++ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); + option_1 = __sv_log_inline ( + svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg); + } +@@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) + The largest observed error in this region is 1.51 ULPs: + _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 + want 0x1.c1e649ee2681dp-1. */ ++ + svfloat64_t option_2 = sv_f64 (0); + if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1)))) + { +- svfloat64_t x2 = svmul_x (pg, ax, ax); +- svfloat64_t x4 = svmul_x (pg, x2, x2); +- svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly); +- option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax)); ++ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); ++ svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2); ++ /* Order-17 Pairwise Horner scheme. 
*/ ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); ++ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); ++ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); ++ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1); ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1); ++ svfloat64_t p1213 ++ = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0); ++ svfloat64_t p1415 ++ = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1); ++ svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17); ++ ++ svfloat64_t p = svmla_x (pg, p1415, x4, p1617); ++ p = svmla_x (pg, p1213, x4, p); ++ p = svmla_x (pg, p1011, x4, p); ++ p = svmla_x (pg, p89, x4, p); ++ ++ p = svmla_x (pg, p67, x4, p); ++ p = svmla_x (pg, p45, x4, p); ++ ++ p = svmla_x (pg, p23, x4, p); ++ ++ p = svmla_x (pg, p01, x4, p); ++ ++ option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); + } + +- /* Choose the right option for each lane. */ +- svfloat64_t y = svsel (ge1, option_1, option_2); +- + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case ( +- x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)), ++ x, ++ svreinterpret_f64 (sveor_x ( ++ pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), + special); ++ ++ /* Choose the right option for each lane. */ ++ svfloat64_t y = svsel (ge1, option_1, option_2); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); + } + +commit 4b0bb84eb7e52a135c873fd9d0fc6c30599aedf4 +Author: Luna Lamb +Date: Thu Feb 13 17:54:46 2025 +0000 + + Aarch64: Improve codegen in SVE exp and users, and update expf_inline + + Use unpredicted muls, and improve memory access. + 7%, 3% and 1% improvement in throughput microbenchmark on Neoverse V1, + for exp, exp2 and cosh respectively. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit c0ff447edf19bd4630fe79adf5e8b896405b059f) + +diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c +index 919f34604a..e375dd8a34 100644 +--- a/sysdeps/aarch64/fpu/cosh_sve.c ++++ b/sysdeps/aarch64/fpu/cosh_sve.c +@@ -23,7 +23,7 @@ static const struct data + { + float64_t poly[3]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; +- uint64_t index_mask, special_bound; ++ uint64_t special_bound; + } data = { + .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, + 0x1.5555576a59599p-5, }, +@@ -35,14 +35,16 @@ static const struct data + .shift = 0x1.8p+52, + .thres = 704.0, + +- .index_mask = 0xff, + /* 0x1.6p9, above which exp overflows. 
*/ + .special_bound = 0x4086000000000000, + }; + + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) + { ++ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); ++ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); ++ svfloat64_t y = svadd_x (pg, half_t, half_over_t); + return sv_call_f64 (cosh, x, y, special); + } + +@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) + + svuint64_t u = svreinterpret_u64 (z); + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); +- svuint64_t i = svand_x (pg, u, d->index_mask); ++ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); + + svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); + y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); + y = svmla_x (pg, sv_f64 (1.0), r, y); +- y = svmul_x (pg, r, y); ++ y = svmul_x (svptrue_b64 (), r, y); + + /* s = 2^(n/N). */ + u = svld1_gather_index (pg, __v_exp_tail_data, i); +@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + svfloat64_t t = exp_inline (ax, pg, d); +- svfloat64_t half_t = svmul_x (pg, t, 0.5); +- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + + /* Fall back to scalar for any special cases. */ + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svadd_x (pg, half_t, half_over_t), special); ++ return special_case (x, pg, t, special); + ++ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); ++ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + return svadd_x (pg, half_t, half_over_t); + } +diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c +index ddf64708cb..bfd3fb9e19 100644 +--- a/sysdeps/aarch64/fpu/exp10_sve.c ++++ b/sysdeps/aarch64/fpu/exp10_sve.c +@@ -18,21 +18,23 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + #define SpecialBound 307.0 /* floor (log10 (2^1023)). */ + + static const struct data + { +- double poly[5]; ++ double c1, c3, c2, c4, c0; + double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound; + } data = { + /* Coefficients generated using Remez algorithm. + rel error: 0x1.9fcb9b3p-60 + abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ] + max ulp err 0.52 +0.5. */ +- .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1, +- 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 }, ++ .c0 = 0x1.26bb1bbb55516p1, ++ .c1 = 0x1.53524c73cd32ap1, ++ .c2 = 0x1.0470591daeafbp1, ++ .c3 = 0x1.2bd77b1361ef6p0, ++ .c4 = 0x1.142b5d54e9621p-1, + /* 1.5*2^46+1023. This value is further explained below. */ + .shift = 0x1.800000000ffc0p+46, + .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ +@@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, d->scale_thres); + +- svfloat64_t r1 = svmul_x (pg, s1, s1); ++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); +- svfloat64_t r0 = svmul_x (pg, r2, s1); ++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + return svsel (p_cmp, r1, r0); + } +@@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) + comes at significant performance cost. 
*/ + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t scale = svexpa (u); +- ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + /* Approximate exp10(r) using polynomial. */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2, +- sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1)); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ svfloat64_t p14 = svmla_x (pg, p12, p34, r2); ++ ++ svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14); + + /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound + multiplication may overflow, so use special case routine. */ +diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c +index 22848ebfa5..5dfb77cdbc 100644 +--- a/sysdeps/aarch64/fpu/exp2_sve.c ++++ b/sysdeps/aarch64/fpu/exp2_sve.c +@@ -18,7 +18,6 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + #define N (1 << V_EXP_TABLE_BITS) + +@@ -27,15 +26,15 @@ + + static const struct data + { +- double poly[4]; ++ double c0, c2; ++ double c1, c3; + double shift, big_bound, uoflow_bound; + } data = { + /* Coefficients are computed using Remez algorithm with + minimisation of the absolute error. */ +- .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5, +- 0x1.3b2abf5571ad8p-7 }, +- .shift = 0x1.8p52 / N, +- .uoflow_bound = UOFlowBound, ++ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, ++ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, ++ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, + .big_bound = BigBound, + }; + +@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); + +- svfloat64_t r1 = svmul_x (pg, s1, s1); ++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); +- svfloat64_t r0 = svmul_x (pg, r2, s1); ++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + return svsel (p_cmp, r1, r0); + } +@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) + svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); + svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); + ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + /* Approximate exp2(r) using polynomial. */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); +- svfloat64_t y = svmul_x (pg, r, p); +- ++ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); ++ svfloat64_t p = svmla_x (pg, p01, p23, r2); ++ svfloat64_t y = svmul_x (svptrue_b64 (), r, p); + /* Assemble exp2(x) = exp2(r) * scale. */ + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (pg, scale, y, kd, d); +diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c +index aabaaa1d61..b2421d493f 100644 +--- a/sysdeps/aarch64/fpu/exp_sve.c ++++ b/sysdeps/aarch64/fpu/exp_sve.c +@@ -21,12 +21,15 @@ + + static const struct data + { +- double poly[4]; ++ double c0, c2; ++ double c1, c3; + double ln2_hi, ln2_lo, inv_ln2, shift, thres; ++ + } data = { +- .poly = { /* ulp error: 0.53. 
*/ +- 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, +- 0x1.1111266d28935p-7 }, ++ .c0 = 0x1.fffffffffdbcdp-2, ++ .c1 = 0x1.555555555444cp-3, ++ .c2 = 0x1.555573c6a9f7dp-5, ++ .c3 = 0x1.1111266d28935p-7, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* 1/ln2. */ +@@ -36,7 +39,6 @@ static const struct data + .thres = 704.0, + }; + +-#define C(i) sv_f64 (d->poly[i]) + #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ + /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ + #define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +@@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n) + svuint64_t b + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ + +- /* Set s1 to generate overflow depending on sign of exponent n. */ +- svfloat64_t s1 = svreinterpret_f64 ( +- svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */ +- /* Offset s to avoid overflow in final result if n is below threshold. */ ++ /* Set s1 to generate overflow depending on sign of exponent n, ++ ie. s1 = 0x70...0 - b. */ ++ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); ++ /* Offset s to avoid overflow in final result if n is below threshold. ++ ie. s2 = as_u64 (s) - 0x3010...0 + b. */ + svfloat64_t s2 = svreinterpret_f64 ( +- svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), +- b)); /* as_u64 (s) - 0x3010...0 + b. */ ++ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, 1280.0); + +- svfloat64_t r1 = svmul_x (pg, s1, s1); ++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); +- svfloat64_t r0 = svmul_x (pg, r2, s1); ++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + return svsel (p_cmp, r1, r0); + } +@@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); +- ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */ + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t r = svmls_lane (x, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t p01 = svmla_x (pg, C (0), C (1), r); +- svfloat64_t p23 = svmla_x (pg, C (2), C (3), r); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); + svfloat64_t p04 = svmla_x (pg, p01, p23, r2); + svfloat64_t y = svmla_x (pg, r, p04, r2); + +diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h +index 6166df6553..75781fb4dd 100644 +--- a/sysdeps/aarch64/fpu/sv_expf_inline.h ++++ b/sysdeps/aarch64/fpu/sv_expf_inline.h +@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + +- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ ++ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. 
*/ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); +@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) + + return svmla_x (pg, scale, scale, poly); + } +- + #endif + +commit 194185c28954dfa11a6ded8b32f34fee680d3218 +Author: Yat Long Poon +Date: Thu Feb 13 18:00:50 2025 +0000 + + AArch64: Improve codegen for SVE erfcf + + Reduce number of MOV/MOVPRFXs and use unpredicated FMUL. + Replace MUL with LSL. Speedup on Neoverse V1: 6%. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit f5ff34cb3c75ec1061c75bb9188b3c1176426947) + +diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c +index ecacb933ac..e4869263e3 100644 +--- a/sysdeps/aarch64/fpu/erfcf_sve.c ++++ b/sysdeps/aarch64/fpu/erfcf_sve.c +@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) + svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ +- i = svmul_x (pg, i, 2); ++ i = svlsl_x (svptrue_b32 (), i, 1); + const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; + svfloat32_t erfcr = svld1_gather_index (pg, p, i); + svfloat32_t scale = svld1_gather_index (pg, p + 1, i); +@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + svfloat32_t r = svsub_x (pg, z, shift); + svfloat32_t d = svsub_x (pg, a, r); +- svfloat32_t d2 = svmul_x (pg, d, d); +- svfloat32_t r2 = svmul_x (pg, r, r); ++ svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d); ++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + + svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); +- svfloat32_t third = svdup_lane (coeffs, 0); + + svfloat32_t p1 = r; +- svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1); +- svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); ++ svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1); ++ svfloat32_t p3 ++ = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); + svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); + p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); + + +commit 7dc549c5a4af3c32689147550144397116404d22 +Author: Yat Long Poon +Date: Thu Feb 13 18:02:01 2025 +0000 + + AArch64: Improve codegen for SVE pow + + Move constants to struct. Improve memory access with indexed/unpredicated + instructions. Eliminate register spills. Speedup on Neoverse V1: 24%. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 0b195651db3ae793187c7dd6d78b5a7a8da9d5e6) + +diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c +index 4c0bf8956c..4242d22a49 100644 +--- a/sysdeps/aarch64/fpu/pow_sve.c ++++ b/sysdeps/aarch64/fpu/pow_sve.c +@@ -44,19 +44,18 @@ + + /* Data is defined in v_pow_log_data.c. */ + #define N_LOG (1 << V_POW_LOG_TABLE_BITS) +-#define A __v_pow_log_data.poly + #define Off 0x3fe6955500000000 + + /* Data is defined in v_pow_exp_data.c. */ + #define N_EXP (1 << V_POW_EXP_TABLE_BITS) + #define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +-#define C __v_pow_exp_data.poly + #define SmallExp 0x3c9 /* top12(0x1p-54). */ + #define BigExp 0x408 /* top12(512.). */ + #define ThresExp 0x03f /* BigExp - SmallExp. */ + #define HugeExp 0x409 /* top12(1024.). */ + + /* Constants associated with pow. 
*/ ++#define SmallBoundX 0x1p-126 + #define SmallPowX 0x001 /* top12(0x1p-126). */ + #define BigPowX 0x7ff /* top12(INFINITY). */ + #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +@@ -64,6 +63,31 @@ + #define BigPowY 0x43e /* top12(0x1.749p62). */ + #define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + ++static const struct data ++{ ++ double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo; ++ double log_c1, log_c3, log_c5, off; ++ double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo; ++ double exp_c0, exp_c1; ++} data = { ++ .log_c0 = -0x1p-1, ++ .log_c1 = -0x1.555555555556p-1, ++ .log_c2 = 0x1.0000000000006p-1, ++ .log_c3 = 0x1.999999959554ep-1, ++ .log_c4 = -0x1.555555529a47ap-1, ++ .log_c5 = -0x1.2495b9b4845e9p0, ++ .log_c6 = 0x1.0002b8b263fc3p0, ++ .off = Off, ++ .exp_c0 = 0x1.fffffffffffd4p-2, ++ .exp_c1 = 0x1.5555571d6ef9p-3, ++ .exp_c2 = 0x1.5555576a5adcep-5, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP, ++ .ln2_over_n_hi = 0x1.62e42fefc0000p-9, ++ .ln2_over_n_lo = -0x1.c610ca86c3899p-45, ++}; ++ + /* Check if x is an integer. */ + static inline svbool_t + sv_isint (svbool_t pg, svfloat64_t x) +@@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x) + static inline svbool_t + sv_isodd (svbool_t pg, svfloat64_t x) + { +- svfloat64_t y = svmul_x (pg, x, 0.5); ++ svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5); + return sv_isnotint (pg, y); + } + +@@ -121,7 +145,7 @@ zeroinfnan (uint64_t i) + static inline svbool_t + sv_zeroinfnan (svbool_t pg, svuint64_t i) + { +- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), ++ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), + 2 * asuint64 (INFINITY) - 1); + } + +@@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ + static inline svfloat64_t +-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) ++sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail, ++ const struct data *d) + { + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), + sv_u64 (N_LOG - 1)); + svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); +- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); ++ svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52)); + svfloat64_t z = svreinterpret_f64 (iz); + svfloat64_t kd = svcvt_f64_x (pg, k); + +@@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + svfloat64_t r = svmad_x (pg, z, invc, -1.0); + /* k*Ln2 + log(c) + r. */ +- svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); ++ ++ svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0); + svfloat64_t t2 = svadd_x (pg, t1, r); +- svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); ++ svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1); + svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); + + /* Evaluation is optimized assuming superscalar pipelined execution. 
*/ +- svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ +- svfloat64_t ar2 = svmul_x (pg, r, ar); +- svfloat64_t ar3 = svmul_x (pg, r, ar2); ++ ++ svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0); ++ svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0); ++ svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar); ++ svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + svfloat64_t hi = svadd_x (pg, t2, ar2); +- svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); ++ svfloat64_t lo3 = svmls_x (pg, ar2, ar, r); + svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ + /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * + A[6])))). */ +- svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); +- svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); +- svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); ++ ++ svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4); ++ svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1); ++ svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0); ++ svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1); + svfloat64_t p = svmla_x (pg, a34, ar2, a56); + p = svmla_x (pg, a12, ar2, p); +- p = svmul_x (pg, ar3, p); ++ p = svmul_x (svptrue_b64 (), ar3, p); + svfloat64_t lo = svadd_x ( +- pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); ++ pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + svfloat64_t y = svadd_x (pg, hi, lo); + *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); + return y; + } + ++static inline svfloat64_t ++sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail, ++ svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits, ++ svuint64_t *ki, const struct data *d) ++{ ++ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ ++ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ ++ svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2); ++ svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0); ++ /* z - kd is in [-1, 1] in non-nearest rounding modes. */ ++ svfloat64_t kd = svrinta_x (pg, z); ++ *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd)); ++ ++ svfloat64_t ln2_over_n_hilo ++ = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi); ++ svfloat64_t r = x; ++ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0); ++ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1); ++ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ ++ r = svadd_x (pg, r, xtail); ++ /* 2^(k/N) ~= scale. */ ++ svuint64_t idx = svand_x (pg, *ki, N_EXP - 1); ++ svuint64_t top ++ = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); ++ /* This is only a valid scale when -1023*N < k < 1024*N. */ ++ *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); ++ *sbits = svadd_x (pg, *sbits, top); ++ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1); ++ *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp); ++ *tmp = svmla_x (pg, r, r2, *tmp); ++ svfloat64_t scale = svreinterpret_f64 (*sbits); ++ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there ++ is no spurious underflow here even without fma. */ ++ z = svmla_x (pg, scale, scale, *tmp); ++ return z; ++} ++ + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. 
+ The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ + static inline svfloat64_t + sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, +- svuint64_t sign_bias) ++ svuint64_t sign_bias, const struct data *d) + { + /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) + and other cases of large values of x (scale * (1 + TMP) oflow). */ +@@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ + svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); + +- /* Conditions special, uflow and oflow are all expressed as uoflow && +- something, hence do not bother computing anything if no lane in uoflow is +- true. */ +- svbool_t special = svpfalse_b (); +- svbool_t uflow = svpfalse_b (); +- svbool_t oflow = svpfalse_b (); ++ svfloat64_t tmp; ++ svuint64_t sbits, ki; + if (__glibc_unlikely (svptest_any (pg, uoflow))) + { ++ svfloat64_t z ++ = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); ++ + /* |x| is tiny (|x| <= 0x1p-54). */ +- uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); ++ svbool_t uflow ++ = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + uflow = svand_z (pg, uoflow, uflow); + /* |x| is huge (|x| >= 1024). */ +- oflow = svcmpge (pg, abstop, HugeExp); ++ svbool_t oflow = svcmpge (pg, abstop, HugeExp); + oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); ++ + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow +- or underflow. */ +- special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); ++ or underflow. */ ++ svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); ++ ++ /* Update result with special and large cases. */ ++ z = sv_call_specialcase (tmp, sbits, ki, z, special); ++ ++ /* Handle underflow and overflow. */ ++ svbool_t x_is_neg = svcmplt (pg, x, 0); ++ svuint64_t sign_mask ++ = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); ++ svfloat64_t res_uoflow ++ = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); ++ res_uoflow = svreinterpret_f64 ( ++ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); ++ /* Avoid spurious underflow for tiny x. */ ++ svfloat64_t res_spurious_uflow ++ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); ++ ++ z = svsel (oflow, res_uoflow, z); ++ z = svsel (uflow, res_spurious_uflow, z); ++ return z; + } + +- /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ +- /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ +- svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); +- /* z - kd is in [-1, 1] in non-nearest rounding modes. */ +- svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); +- svfloat64_t kd = svadd_x (pg, z, shift); +- svuint64_t ki = svreinterpret_u64 (kd); +- kd = svsub_x (pg, kd, shift); +- svfloat64_t r = x; +- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); +- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); +- /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ +- r = svadd_x (pg, r, xtail); +- /* 2^(k/N) ~= scale. */ +- svuint64_t idx = svand_x (pg, ki, N_EXP - 1); +- svuint64_t top +- = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); +- /* This is only a valid scale when -1023*N < k < 1024*N. */ +- svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); +- sbits = svadd_x (pg, sbits, top); +- /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). 
*/ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); +- tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); +- tmp = svmla_x (pg, r, r2, tmp); +- svfloat64_t scale = svreinterpret_f64 (sbits); +- /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there +- is no spurious underflow here even without fma. */ +- z = svmla_x (pg, scale, scale, tmp); +- +- /* Update result with special and large cases. */ +- if (__glibc_unlikely (svptest_any (pg, special))) +- z = sv_call_specialcase (tmp, sbits, ki, z, special); +- +- /* Handle underflow and overflow. */ +- svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); +- svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); +- svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); +- svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); +- res_uoflow = svreinterpret_f64 ( +- svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); +- z = svsel (oflow, res_uoflow, z); +- /* Avoid spurious underflow for tiny x. */ +- svfloat64_t res_spurious_uflow +- = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); +- z = svsel (uflow, res_spurious_uflow, z); +- +- return z; ++ return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); + } + + static inline double +@@ -341,47 +384,39 @@ pow_sc (double x, double y) + + svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + /* This preamble handles special case conditions used in the final scalar + fallbacks. It also updates ix and sign_bias, that are used in the core + computation too, i.e., exp( y * log (x) ). */ + svuint64_t vix0 = svreinterpret_u64 (x); + svuint64_t viy0 = svreinterpret_u64 (y); +- svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); + + /* Negative x cases. */ +- svuint64_t sign_bit = svlsr_m (pg, vix0, 63); +- svbool_t xisneg = svcmpeq (pg, sign_bit, 1); ++ svbool_t xisneg = svcmplt (pg, x, 0); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ +- svbool_t yisnotint_xisneg = svpfalse_b (); ++ svbool_t yint_or_xpos = pg; + svuint64_t sign_bias = sv_u64 (0); + svuint64_t vix = vix0; +- svuint64_t vtopx1 = vtopx0; + if (__glibc_unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ +- yisnotint_xisneg = sv_isnotint (xisneg, y); +- svbool_t yisint_xisneg = sv_isint (xisneg, y); ++ yint_or_xpos = sv_isint (xisneg, y); + svbool_t yisodd_xisneg = sv_isodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ +- vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); +- vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); ++ vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); + } + +- /* Special cases of x or y: zero, inf and nan. */ +- svbool_t xspecial = sv_zeroinfnan (pg, vix0); +- svbool_t yspecial = sv_zeroinfnan (pg, viy0); +- svbool_t special = svorr_z (pg, xspecial, yspecial); +- + /* Small cases of x: |x| < 0x1p-126. */ +- svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); +- svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); +- if (__glibc_unlikely (svptest_any (pg, xsmall))) ++ svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX); ++ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. 
*/ +- svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); ++ svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52); ++ svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0); + + svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); + vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); +@@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) + + /* y_hi = log(ix, &y_lo). */ + svfloat64_t vlo; +- svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); ++ svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d); + + /* z = exp(y_hi, y_lo, sign_bias). */ +- svfloat64_t vehi = svmul_x (pg, y, vhi); +- svfloat64_t velo = svmul_x (pg, y, vlo); +- svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); +- velo = svsub_x (pg, velo, vemi); +- svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); ++ svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi); ++ svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi); ++ svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo); ++ svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d); + + /* Cases of finite y and finite negative x. */ +- vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); ++ vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan (""))); ++ ++ /* Special cases of x or y: zero, inf and nan. */ ++ svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0); ++ svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0); ++ svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial); + + /* Cases of zero/inf/nan x or y. */ +- if (__glibc_unlikely (svptest_any (pg, special))) ++ if (__glibc_unlikely (svptest_any (svptrue_b64 (), special))) + vz = sv_call2_f64 (pow_sc, x, y, vz, special); + + return vz; + +commit 06fd8ad78f35a6cc65dc7c6c08ce55faf6ad079d +Author: Yat Long Poon +Date: Thu Feb 13 18:03:04 2025 +0000 + + AArch64: Improve codegen for SVE powf + + Improve memory access with indexed/unpredicated instructions. + Eliminate register spills. Speedup on Neoverse V1: 3%. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 95e807209b680257a9afe81a507754f1565dbb4d) + +diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c +index 4f6a142325..08d7019a18 100644 +--- a/sysdeps/aarch64/fpu/powf_sve.c ++++ b/sysdeps/aarch64/fpu/powf_sve.c +@@ -26,7 +26,6 @@ + #define Tlogc __v_powf_data.logc + #define Texp __v_powf_data.scale + #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) +-#define Shift 0x1.8p52 + #define Norm 0x1p23f /* 0x4b000000. */ + + /* Overall ULP error bound for pow is 2.6 ulp +@@ -36,7 +35,7 @@ static const struct data + double log_poly[4]; + double exp_poly[3]; + float uflow_bound, oflow_bound, small_bound; +- uint32_t sign_bias, sign_mask, subnormal_bias, off; ++ uint32_t sign_bias, subnormal_bias, off; + } data = { + /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of + V_POWF_EXP2_N. */ +@@ -53,7 +52,6 @@ static const struct data + .small_bound = 0x1p-126f, + .off = 0x3f35d000, + .sign_bias = SignBias, +- .sign_mask = 0x80000000, + .subnormal_bias = 0x0b800000, /* 23 << 23. 
*/ + }; + +@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x) + static inline svbool_t + sv_zeroinfnan (svbool_t pg, svuint32_t i) + { +- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), ++ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), + 2u * 0x7f800000 - 1); + } + +@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z) + } + + /* Scalar fallback for special case routines with custom signature. */ +-static inline svfloat32_t +-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) ++static svfloat32_t NOINLINE ++sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y) + { ++ /* Special cases of x or y: zero, inf and nan. */ ++ svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1)); ++ svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2)); ++ svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial); ++ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { +@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, + + /* Polynomial to approximate log1p(r)/ln2. */ + svfloat64_t logx = A (0); +- logx = svmla_x (pg, A (1), r, logx); +- logx = svmla_x (pg, A (2), r, logx); +- logx = svmla_x (pg, A (3), r, logx); +- logx = svmla_x (pg, y0, r, logx); ++ logx = svmad_x (pg, r, logx, A (1)); ++ logx = svmad_x (pg, r, logx, A (2)); ++ logx = svmad_x (pg, r, logx, A (3)); ++ logx = svmad_x (pg, r, logx, y0); + *pylogx = svmul_x (pg, y, logx); + + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ +- svfloat64_t kd = svadd_x (pg, *pylogx, Shift); +- svuint64_t ki = svreinterpret_u64 (kd); +- kd = svsub_x (pg, kd, Shift); ++ svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx); ++ svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd)); + + r = svsub_x (pg, *pylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ +- svuint64_t t +- = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); +- svuint64_t ski = svadd_x (pg, ki, sign_bias); +- t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); ++ svuint64_t t = svld1_gather_index ( ++ svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1)); ++ svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias); ++ t = svadd_x (svptrue_b64 (), t, ++ svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svfloat64_t s = svreinterpret_f64 (t); + + svfloat64_t p = C (0); + p = svmla_x (pg, C (1), p, r); + p = svmla_x (pg, C (2), p, r); +- p = svmla_x (pg, s, p, svmul_x (pg, s, r)); ++ p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r)); + + return p; + } +@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, + { + const svbool_t ptrue = svptrue_b64 (); + +- /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in +- order to perform core computation in double precision. */ ++ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two ++ * in order to perform core computation in double precision. 
*/ + const svbool_t pg_lo = svunpklo (pg); + const svbool_t pg_hi = svunpkhi (pg); +- svfloat64_t y_lo = svcvt_f64_x ( +- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); +- svfloat64_t y_hi = svcvt_f64_x ( +- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); +- svfloat32_t z = svreinterpret_f32 (iz); +- svfloat64_t z_lo = svcvt_f64_x ( +- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); +- svfloat64_t z_hi = svcvt_f64_x ( +- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); ++ svfloat64_t y_lo ++ = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); ++ svfloat64_t y_hi ++ = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); ++ svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz))); ++ svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz))); + svuint64_t i_lo = svunpklo (i); + svuint64_t i_hi = svunpkhi (i); + svint64_t k_lo = svunpklo (k); +@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, + /* Implementation of SVE powf. + Provides the same accuracy as AdvSIMD powf, since it relies on the same + algorithm. The theoretical maximum error is under 2.60 ULPs. +- Maximum measured error is 2.56 ULPs: +- SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 +- want 0x1.fd4b06p+127. */ ++ Maximum measured error is 2.57 ULPs: ++ SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127 ++ want 0x1.fff862p+127. */ + svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) + svuint32_t viy0 = svreinterpret_u32 (y); + + /* Negative x cases. */ +- svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); +- svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); ++ svbool_t xisneg = svcmplt (pg, x, sv_f32 (0)); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ +- svbool_t yisnotint_xisneg = svpfalse_b (); ++ svbool_t yint_or_xpos = pg; + svuint32_t sign_bias = sv_u32 (0); + svuint32_t vix = vix0; + if (__glibc_unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ +- yisnotint_xisneg = svisnotint (xisneg, y); +- svbool_t yisint_xisneg = svisint (xisneg, y); ++ yint_or_xpos = svisint (xisneg, y); + svbool_t yisodd_xisneg = svisodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ +- vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); ++ vix = svand_m (yint_or_xpos, vix0, 0x7fffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); + } +@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) + svbool_t cmp = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ +- svbool_t xsmall = svaclt (pg, x, d->small_bound); +- if (__glibc_unlikely (svptest_any (pg, xsmall))) ++ svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound); ++ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. */ + svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); +@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) + vix = svsel (xsmall, vix_norm, vix); + } + /* Part of core computation carried in working precision. 
*/ +- svuint32_t tmp = svsub_x (pg, vix, d->off); +- svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), +- V_POWF_LOG2_N - 1); +- svuint32_t top = svand_x (pg, tmp, 0xff800000); +- svuint32_t iz = svsub_x (pg, vix, top); +- svint32_t k +- = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); +- +- /* Compute core in extended precision and return intermediate ylogx results to +- handle cases of underflow and underflow in exp. */ ++ svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off); ++ svuint32_t i = svand_x ( ++ yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), ++ V_POWF_LOG2_N - 1); ++ svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000); ++ svuint32_t iz = svsub_x (yint_or_xpos, vix, top); ++ svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top), ++ (23 - V_POWF_EXP2_TABLE_BITS)); ++ ++ /* Compute core in extended precision and return intermediate ylogx results ++ * to handle cases of underflow and underflow in exp. */ + svfloat32_t ylogx; +- svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); ++ svfloat32_t ret ++ = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d); + + /* Handle exp special cases of underflow and overflow. */ +- svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); ++ svuint32_t sign ++ = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); + svfloat32_t ret_oflow +- = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); ++ = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY))); + svfloat32_t ret_uflow = svreinterpret_f32 (sign); +- ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); +- ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); ++ ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret); ++ ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret); + + /* Cases of finite y and finite negative x. */ +- ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); ++ ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf (""))); + +- if (__glibc_unlikely (svptest_any (pg, cmp))) +- return sv_call_powf_sc (x, y, ret, cmp); ++ if (__glibc_unlikely (svptest_any (cmp, cmp))) ++ return sv_call_powf_sc (x, y, ret); + + return ret; + } + +commit fd9a3a36fdcf14d1678c469e8b9033a46aa6c6fb +Author: Wilco Dijkstra +Date: Thu Feb 27 20:34:34 2025 +0000 + + Revert "AArch64: Add vector logp1 alias for log1p" + + This reverts commit a991a0fc7c051d7ef2ea7778e0a699f22d4e53d7. + +diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h +index 5019e8e25c..08a41c46ad 100644 +--- a/bits/libm-simd-decl-stubs.h ++++ b/bits/libm-simd-decl-stubs.h +@@ -253,17 +253,6 @@ + #define __DECL_SIMD_log1pf64x + #define __DECL_SIMD_log1pf128x + +-#define __DECL_SIMD_logp1 +-#define __DECL_SIMD_logp1f +-#define __DECL_SIMD_logp1l +-#define __DECL_SIMD_logp1f16 +-#define __DECL_SIMD_logp1f32 +-#define __DECL_SIMD_logp1f64 +-#define __DECL_SIMD_logp1f128 +-#define __DECL_SIMD_logp1f32x +-#define __DECL_SIMD_logp1f64x +-#define __DECL_SIMD_logp1f128x +- + #define __DECL_SIMD_atanh + #define __DECL_SIMD_atanhf + #define __DECL_SIMD_atanhl +diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h +index 92856becc4..6cb594b6ff 100644 +--- a/math/bits/mathcalls.h ++++ b/math/bits/mathcalls.h +@@ -126,7 +126,7 @@ __MATHCALL (log2p1,, (_Mdouble_ __x)); + __MATHCALL (log10p1,, (_Mdouble_ __x)); + + /* Return log(1 + X). 
*/ +-__MATHCALL_VEC (logp1,, (_Mdouble_ __x)); ++__MATHCALL (logp1,, (_Mdouble_ __x)); + #endif + + #if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99 +diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions +index 015211f5f4..cc15ce2d1e 100644 +--- a/sysdeps/aarch64/fpu/Versions ++++ b/sysdeps/aarch64/fpu/Versions +@@ -135,11 +135,4 @@ libmvec { + _ZGVsMxv_tanh; + _ZGVsMxv_tanhf; + } +- GLIBC_2.41 { +- _ZGVnN2v_logp1; +- _ZGVnN2v_logp1f; +- _ZGVnN4v_logp1f; +- _ZGVsMxv_logp1; +- _ZGVsMxv_logp1f; +- } + } +diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h +index 5909bb4ce9..097d403ffe 100644 +--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h ++++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h +@@ -36,7 +36,6 @@ libmvec_hidden_proto (V_NAME_F2(hypot)); + libmvec_hidden_proto (V_NAME_F1(log10)); + libmvec_hidden_proto (V_NAME_F1(log1p)); + libmvec_hidden_proto (V_NAME_F1(log2)); +-libmvec_hidden_proto (V_NAME_F1(logp1)); + libmvec_hidden_proto (V_NAME_F1(log)); + libmvec_hidden_proto (V_NAME_F2(pow)); + libmvec_hidden_proto (V_NAME_F1(sin)); +diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h +index f295fe185d..7484150131 100644 +--- a/sysdeps/aarch64/fpu/bits/math-vector.h ++++ b/sysdeps/aarch64/fpu/bits/math-vector.h +@@ -113,10 +113,6 @@ + # define __DECL_SIMD_log2 __DECL_SIMD_aarch64 + # undef __DECL_SIMD_log2f + # define __DECL_SIMD_log2f __DECL_SIMD_aarch64 +-# undef __DECL_SIMD_logp1 +-# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64 +-# undef __DECL_SIMD_logp1f +-# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64 + # undef __DECL_SIMD_pow + # define __DECL_SIMD_pow __DECL_SIMD_aarch64 + # undef __DECL_SIMD_powf +@@ -184,7 +180,6 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); +-__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); + __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); +@@ -212,7 +207,6 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +-__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); + __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +@@ -245,7 +239,6 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t); +-__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); + __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t); +@@ -273,7 +266,6 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t); +-__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); + __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t); +diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c 
b/sysdeps/aarch64/fpu/log1p_advsimd.c +index 1263587201..9d18578ce6 100644 +--- a/sysdeps/aarch64/fpu/log1p_advsimd.c ++++ b/sysdeps/aarch64/fpu/log1p_advsimd.c +@@ -58,5 +58,3 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) + + return log1p_inline (x, &d->d); + } +- +-strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c +index b21cfb2c90..04f7e5720e 100644 +--- a/sysdeps/aarch64/fpu/log1p_sve.c ++++ b/sysdeps/aarch64/fpu/log1p_sve.c +@@ -116,5 +116,3 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) + + return y; + } +- +-strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) +diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c +index 00006fc703..f2d47962fe 100644 +--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c ++++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c +@@ -93,6 +93,3 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) + + libmvec_hidden_def (V_NAME_F1 (log1p)) + HALF_WIDTH_ALIAS_F1 (log1p) +-strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) +-libmvec_hidden_def (V_NAME_F1 (logp1)) +-HALF_WIDTH_ALIAS_F1 (logp1) +diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c +index 18a185c838..4f17c44e2d 100644 +--- a/sysdeps/aarch64/fpu/log1pf_sve.c ++++ b/sysdeps/aarch64/fpu/log1pf_sve.c +@@ -42,5 +42,3 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) + + return sv_log1pf_inline (x, pg); + } +- +-strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) +diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist +index 98687cae0d..b685106954 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist ++++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist +@@ -128,8 +128,3 @@ GLIBC_2.40 _ZGVsMxvv_hypot F + GLIBC_2.40 _ZGVsMxvv_hypotf F + GLIBC_2.40 _ZGVsMxvv_pow F + GLIBC_2.40 _ZGVsMxvv_powf F +-GLIBC_2.41 _ZGVnN2v_logp1 F +-GLIBC_2.41 _ZGVnN2v_logp1f F +-GLIBC_2.41 _ZGVnN4v_logp1f F +-GLIBC_2.41 _ZGVsMxv_logp1 F +-GLIBC_2.41 _ZGVsMxv_logp1f F + +commit 64896b7d329809127035fde42768a6f7eeffed75 +Author: Wilco Dijkstra +Date: Wed Aug 7 14:43:47 2024 +0100 + + AArch64: Improve generic strlen + + Improve performance by handling another 16 bytes before entering the loop. + Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final + size computation to avoid increasing latency. On Neoverse V1 performance + of the random strlen benchmark improves by 4.6%. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7) + +diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S +index ab2a576cdb..352fb40d3a 100644 +--- a/sysdeps/aarch64/strlen.S ++++ b/sysdeps/aarch64/strlen.S +@@ -1,4 +1,5 @@ +-/* Copyright (C) 2012-2024 Free Software Foundation, Inc. ++/* Generic optimized strlen using SIMD. ++ Copyright (C) 2012-2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ +@@ -56,36 +57,50 @@ ENTRY (STRLEN) + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift +- cbz synd, L(loop) ++ cbz synd, L(next16) + + rbit synd, synd + clz result, synd + lsr result, result, 2 + ret + ++L(next16): ++ ldr data, [src, 16] ++ cmeq vhas_nul.16b, vdata.16b, 0 ++ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ ++ fmov synd, dend ++ cbz synd, L(loop) ++ add src, src, 16 ++#ifndef __AARCH64EB__ ++ rbit synd, synd ++#endif ++ sub result, src, srcin ++ clz tmp, synd ++ add result, result, tmp, lsr 2 ++ ret ++ + .p2align 5 + L(loop): +- ldr data, [src, 16] ++ ldr data, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 +- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b ++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbnz synd, L(loop_end) +- ldr data, [src, 32]! ++ ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 +- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b ++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbz synd, L(loop) +- sub src, src, 16 ++ add src, src, 16 + L(loop_end): +- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ +- sub result, src, srcin +- fmov synd, dend ++ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */ + #ifndef __AARCH64EB__ + rbit synd, synd ++ sub result, result, 3 + #endif +- add result, result, 16 + clz tmp, synd +- add result, result, tmp, lsr 2 ++ sub result, tmp, result ++ lsr result, result, 2 + ret + + END (STRLEN) + +commit 544fb349d35efd5f86ed7e482759ff21496a32fd +Author: Wilco Dijkstra +Date: Mon Sep 9 15:26:47 2024 +0100 + + AArch64: Optimize memset + + Improve small memsets by avoiding branches and use overlapping stores. + Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes + other than 64 and 128. Performance of random memset benchmark improves by 24% + on Neoverse N1. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8) + +diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S +index 7ef77ee8c9..caafb019e2 100644 +--- a/sysdeps/aarch64/memset.S ++++ b/sysdeps/aarch64/memset.S +@@ -1,4 +1,5 @@ +-/* Copyright (C) 2012-2024 Free Software Foundation, Inc. ++/* Generic optimized memset using SIMD. ++ Copyright (C) 2012-2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + +@@ -17,7 +18,6 @@ + . */ + + #include +-#include "memset-reg.h" + + #ifndef MEMSET + # define MEMSET memset +@@ -25,130 +25,132 @@ + + /* Assumptions: + * +- * ARMv8-a, AArch64, unaligned accesses ++ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +-ENTRY (MEMSET) ++#define dstin x0 ++#define val x1 ++#define valw w1 ++#define count x2 ++#define dst x3 ++#define dstend x4 ++#define zva_val x5 ++#define off x3 ++#define dstend2 x5 + ++ENTRY (MEMSET) + PTR_ARG (0) + SIZE_ARG (2) + + dup v0.16B, valw ++ cmp count, 16 ++ b.lo L(set_small) ++ + add dstend, dstin, count ++ cmp count, 64 ++ b.hs L(set_128) + +- cmp count, 96 +- b.hi L(set_long) +- cmp count, 16 +- b.hs L(set_medium) +- mov val, v0.D[0] ++ /* Set 16..63 bytes. */ ++ mov off, 16 ++ and off, off, count, lsr 1 ++ sub dstend2, dstend, off ++ str q0, [dstin] ++ str q0, [dstin, off] ++ str q0, [dstend2, -16] ++ str q0, [dstend, -16] ++ ret + ++ .p2align 4 + /* Set 0..15 bytes. 
*/ +- tbz count, 3, 1f +- str val, [dstin] +- str val, [dstend, -8] +- ret +- nop +-1: tbz count, 2, 2f +- str valw, [dstin] +- str valw, [dstend, -4] ++L(set_small): ++ add dstend, dstin, count ++ cmp count, 4 ++ b.lo 2f ++ lsr off, count, 3 ++ sub dstend2, dstend, off, lsl 2 ++ str s0, [dstin] ++ str s0, [dstin, off, lsl 2] ++ str s0, [dstend2, -4] ++ str s0, [dstend, -4] + ret ++ ++ /* Set 0..3 bytes. */ + 2: cbz count, 3f ++ lsr off, count, 1 + strb valw, [dstin] +- tbz count, 1, 3f +- strh valw, [dstend, -2] ++ strb valw, [dstin, off] ++ strb valw, [dstend, -1] + 3: ret + +- /* Set 17..96 bytes. */ +-L(set_medium): +- str q0, [dstin] +- tbnz count, 6, L(set96) +- str q0, [dstend, -16] +- tbz count, 5, 1f +- str q0, [dstin, 16] +- str q0, [dstend, -32] +-1: ret +- + .p2align 4 +- /* Set 64..96 bytes. Write 64 bytes from the start and +- 32 bytes from the end. */ +-L(set96): +- str q0, [dstin, 16] ++L(set_128): ++ bic dst, dstin, 15 ++ cmp count, 128 ++ b.hi L(set_long) ++ stp q0, q0, [dstin] + stp q0, q0, [dstin, 32] ++ stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +- .p2align 3 +- nop ++ .p2align 4 + L(set_long): +- and valw, valw, 255 +- bic dst, dstin, 15 + str q0, [dstin] +- cmp count, 256 +- ccmp valw, 0, 0, cs +- b.eq L(try_zva) +-L(no_zva): +- sub count, dstend, dst /* Count is 16 too large. */ +- sub dst, dst, 16 /* Dst is biased by -32. */ +- sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +-1: stp q0, q0, [dst, 32] +- stp q0, q0, [dst, 64]! +-L(tail64): +- subs count, count, 64 +- b.hi 1b +-2: stp q0, q0, [dstend, -64] ++ str q0, [dst, 16] ++ tst valw, 255 ++ b.ne L(no_zva) ++#ifndef ZVA64_ONLY ++ mrs zva_val, dczid_el0 ++ and zva_val, zva_val, 31 ++ cmp zva_val, 4 /* ZVA size is 64 bytes. */ ++ b.ne L(zva_128) ++#endif ++ stp q0, q0, [dst, 32] ++ bic dst, dstin, 63 ++ sub count, dstend, dst /* Count is now 64 too large. */ ++ sub count, count, 64 + 64 /* Adjust count and bias for loop. */ ++ ++ /* Write last bytes before ZVA loop. */ ++ stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] ++ ++ .p2align 4 ++L(zva64_loop): ++ add dst, dst, 64 ++ dc zva, dst ++ subs count, count, 64 ++ b.hi L(zva64_loop) + ret + +-L(try_zva): +-#ifndef ZVA64_ONLY + .p2align 3 +- mrs tmp1, dczid_el0 +- tbnz tmp1w, 4, L(no_zva) +- and tmp1w, tmp1w, 15 +- cmp tmp1w, 4 /* ZVA size is 64 bytes. */ +- b.ne L(zva_128) +- nop +-#endif +- /* Write the first and last 64 byte aligned block using stp rather +- than using DC ZVA. This is faster on some cores. +- */ +- .p2align 4 +-L(zva_64): +- str q0, [dst, 16] ++L(no_zva): ++ sub count, dstend, dst /* Count is 32 too large. */ ++ sub count, count, 64 + 32 /* Adjust count and bias for loop. */ ++L(no_zva_loop): + stp q0, q0, [dst, 32] +- bic dst, dst, 63 + stp q0, q0, [dst, 64] +- stp q0, q0, [dst, 96] +- sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128+64+64 /* Adjust count and bias for loop. */ +- add dst, dst, 128 +-1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 +- b.hi 1b +- stp q0, q0, [dst, 0] +- stp q0, q0, [dst, 32] ++ b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + #ifndef ZVA64_ONLY +- .p2align 3 ++ .p2align 4 + L(zva_128): +- cmp tmp1w, 5 /* ZVA size is 128 bytes. */ +- b.ne L(zva_other) ++ cmp zva_val, 5 /* ZVA size is 128 bytes. */ ++ b.ne L(no_zva) + +- str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. 
*/ +- sub count, count, 128+128 /* Adjust count and bias for loop. */ +- add dst, dst, 128 +-1: dc zva, dst +- add dst, dst, 128 ++ sub count, count, 128 + 128 /* Adjust count and bias for loop. */ ++1: add dst, dst, 128 ++ dc zva, dst + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] +@@ -156,35 +158,6 @@ L(zva_128): + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +- +-L(zva_other): +- mov tmp2w, 4 +- lsl zva_lenw, tmp2w, tmp1w +- add tmp1, zva_len, 64 /* Max alignment bytes written. */ +- cmp count, tmp1 +- blo L(no_zva) +- +- sub tmp2, zva_len, 1 +- add tmp1, dst, zva_len +- add dst, dst, 16 +- subs count, tmp1, dst /* Actual alignment bytes to write. */ +- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ +- beq 2f +-1: stp q0, q0, [dst], 64 +- stp q0, q0, [dst, -32] +- subs count, count, 64 +- b.hi 1b +-2: mov dst, tmp1 +- sub count, dstend, tmp1 /* Remaining bytes to write. */ +- subs count, count, zva_len +- b.lo 4f +-3: dc zva, dst +- add dst, dst, zva_len +- subs count, count, zva_len +- b.hs 3b +-4: add count, count, zva_len +- sub dst, dst, 32 /* Bias dst for tail loop. */ +- b L(tail64) + #endif + + END (MEMSET) + +commit 41eb2f8b5847079caca90a74659456adbb80ec29 +Author: Wilco Dijkstra +Date: Mon Nov 25 18:43:08 2024 +0000 + + AArch64: Remove zva_128 from memset + + Remove ZVA 128 support from memset - the new memset no longer + guarantees count >= 256, which can result in underflow and a + crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA + size of 128 and its memcpy implementation was removed in commit + e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special + case too. + + [1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html + + Reviewed-by: Andrew Pinski + (cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d) + +diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S +index caafb019e2..71814d0b2f 100644 +--- a/sysdeps/aarch64/memset.S ++++ b/sysdeps/aarch64/memset.S +@@ -104,7 +104,7 @@ L(set_long): + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ +- b.ne L(zva_128) ++ b.ne L(no_zva) + #endif + stp q0, q0, [dst, 32] + bic dst, dstin, 63 +@@ -137,28 +137,5 @@ L(no_zva_loop): + stp q0, q0, [dstend, -32] + ret + +-#ifndef ZVA64_ONLY +- .p2align 4 +-L(zva_128): +- cmp zva_val, 5 /* ZVA size is 128 bytes. */ +- b.ne L(no_zva) +- +- stp q0, q0, [dst, 32] +- stp q0, q0, [dst, 64] +- stp q0, q0, [dst, 96] +- bic dst, dst, 127 +- sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128 + 128 /* Adjust count and bias for loop. */ +-1: add dst, dst, 128 +- dc zva, dst +- subs count, count, 128 +- b.hi 1b +- stp q0, q0, [dstend, -128] +- stp q0, q0, [dstend, -96] +- stp q0, q0, [dstend, -64] +- stp q0, q0, [dstend, -32] +- ret +-#endif +- + END (MEMSET) + libc_hidden_builtin_def (MEMSET) + +commit 27fa0268ead054810a5e2669d0b5bb88ceb05b05 +Author: Wilco Dijkstra +Date: Wed Jul 24 15:17:47 2024 +0100 + + math: Improve layout of expf data + + GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch + changes the exp2f_data struct slightly so that the fields are better aligned. + As a result on targets that support them, load-pair instructions accessing + poly_scaled and invln2_scaled are now 16-byte aligned. 
+ + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a) + +diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h +index 729f22cd4f..dc07ebd459 100644 +--- a/sysdeps/ieee754/flt-32/math_config.h ++++ b/sysdeps/ieee754/flt-32/math_config.h +@@ -166,9 +166,9 @@ extern const struct exp2f_data + uint64_t tab[1 << EXP2F_TABLE_BITS]; + double shift_scaled; + double poly[EXP2F_POLY_ORDER]; +- double shift; + double invln2_scaled; + double poly_scaled[EXP2F_POLY_ORDER]; ++ double shift; + } __exp2f_data attribute_hidden; + + #define LOGF_TABLE_BITS 4 + +commit 7038970f1f485fb660606f0c596f432fdef250f6 +Author: Wilco Dijkstra +Date: Tue Dec 24 18:01:59 2024 +0000 + + AArch64: Add SVE memset + + Add SVE memset based on the generic memset with predicated load for sizes < 16. + Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned + stores for the last 64 bytes. Performance of random memset benchmark improves + by ~2% on Neoverse V1. + + Reviewed-by: Yury Khrustalev + (cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548) + +diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile +index 3e251cc234..6880ebc035 100644 +--- a/sysdeps/aarch64/multiarch/Makefile ++++ b/sysdeps/aarch64/multiarch/Makefile +@@ -16,6 +16,7 @@ sysdep_routines += \ + memset_kunpeng \ + memset_mops \ + memset_oryon1 \ ++ memset_sve_zva64 \ + memset_zva64 \ + strlen_asimd \ + strlen_generic \ +diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c +index b2fda541f9..1f101a719b 100644 +--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c +@@ -61,6 +61,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) + #if HAVE_AARCH64_SVE_ASM + IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx) ++ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64) + #endif + IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) +diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c +index bd063c16c9..4f65295e77 100644 +--- a/sysdeps/aarch64/multiarch/memset.c ++++ b/sysdeps/aarch64/multiarch/memset.c +@@ -36,6 +36,7 @@ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; + extern __typeof (__redirect_memset) __memset_generic attribute_hidden; + extern __typeof (__redirect_memset) __memset_mops attribute_hidden; + extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden; ++extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden; + + static inline __typeof (__redirect_memset) * + select_memset_ifunc (void) +@@ -49,6 +50,9 @@ select_memset_ifunc (void) + { + if (IS_A64FX (midr) && zva_size == 256) + return __memset_a64fx; ++ ++ if (zva_size == 64) ++ return __memset_sve_zva64; + } + + if (IS_ORYON1 (midr) && zva_size == 64) +diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S +new file mode 100644 +index 0000000000..7fb40fdd9e +--- /dev/null ++++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S +@@ -0,0 +1,123 @@ ++/* Optimized memset for SVE. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#include ++ ++/* Assumptions: ++ * ++ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. ++ * ZVA size is 64. ++ */ ++ ++#if HAVE_AARCH64_SVE_ASM ++ ++.arch armv8.2-a+sve ++ ++#define dstin x0 ++#define val x1 ++#define valw w1 ++#define count x2 ++#define dst x3 ++#define dstend x4 ++#define zva_val x5 ++#define vlen x5 ++#define off x3 ++#define dstend2 x5 ++ ++ENTRY (__memset_sve_zva64) ++ dup v0.16B, valw ++ cmp count, 16 ++ b.lo L(set_16) ++ ++ add dstend, dstin, count ++ cmp count, 64 ++ b.hs L(set_128) ++ ++ /* Set 16..63 bytes. */ ++ mov off, 16 ++ and off, off, count, lsr 1 ++ sub dstend2, dstend, off ++ str q0, [dstin] ++ str q0, [dstin, off] ++ str q0, [dstend2, -16] ++ str q0, [dstend, -16] ++ ret ++ ++ .p2align 4 ++L(set_16): ++ whilelo p0.b, xzr, count ++ st1b z0.b, p0, [dstin] ++ ret ++ ++ .p2align 4 ++L(set_128): ++ bic dst, dstin, 15 ++ cmp count, 128 ++ b.hi L(set_long) ++ stp q0, q0, [dstin] ++ stp q0, q0, [dstin, 32] ++ stp q0, q0, [dstend, -64] ++ stp q0, q0, [dstend, -32] ++ ret ++ ++ .p2align 4 ++L(set_long): ++ cmp count, 256 ++ b.lo L(no_zva) ++ tst valw, 255 ++ b.ne L(no_zva) ++ ++ str q0, [dstin] ++ str q0, [dst, 16] ++ bic dst, dstin, 31 ++ stp q0, q0, [dst, 32] ++ bic dst, dstin, 63 ++ sub count, dstend, dst /* Count is now 64 too large. */ ++ sub count, count, 128 /* Adjust count and bias for loop. */ ++ ++ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */ ++ bic x8, x8, 15 ++ stp q0, q0, [x8, -48] ++ str q0, [x8, -16] ++ str q0, [dstend, -16] ++ ++ .p2align 4 ++L(zva64_loop): ++ add dst, dst, 64 ++ dc zva, dst ++ subs count, count, 64 ++ b.hi L(zva64_loop) ++ ret ++ ++L(no_zva): ++ str q0, [dstin] ++ sub count, dstend, dst /* Count is 16 too large. */ ++ sub count, count, 64 + 16 /* Adjust count and bias for loop. */ ++L(no_zva_loop): ++ stp q0, q0, [dst, 16] ++ stp q0, q0, [dst, 48] ++ add dst, dst, 64 ++ subs count, count, 64 ++ b.hi L(no_zva_loop) ++ stp q0, q0, [dstend, -64] ++ stp q0, q0, [dstend, -32] ++ ret ++ ++END (__memset_sve_zva64) ++#endif + +commit d6175a44e95fe443d0fbfed37a9ff7424f1e2661 +Author: Wilco Dijkstra +Date: Thu Feb 27 16:28:52 2025 +0000 + + AArch64: Use prefer_sve_ifuncs for SVE memset + + Use prefer_sve_ifuncs for SVE memset just like memcpy. 
+ + Reviewed-by: Yury Khrustalev + (cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4) + +diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c +index 4f65295e77..bb1e865c97 100644 +--- a/sysdeps/aarch64/multiarch/memset.c ++++ b/sysdeps/aarch64/multiarch/memset.c +@@ -51,7 +51,7 @@ select_memset_ifunc (void) + if (IS_A64FX (midr) && zva_size == 256) + return __memset_a64fx; + +- if (zva_size == 64) ++ if (prefer_sve_ifuncs && zva_size == 64) + return __memset_sve_zva64; + } + + +commit d8e8342369831808b00324790c8809ba33408ee7 +Author: Wilco Dijkstra +Date: Fri Dec 13 15:43:07 2024 +0000 + + math: Improve layout of exp/exp10 data + + GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch + changes the exp_data struct slightly so that the fields are better aligned + and without gaps. As a result on targets that support them, more load-pair + instructions are used in exp. Exp10 is improved by moving invlog10_2N later + so that neglog10_2hiN and neglog10_2loN can be loaded using load-pair. + + The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on + Neoverse V2. Exp10 improves by 1.5%. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7) + +diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h +index ef87cfa6be..05515fd95a 100644 +--- a/sysdeps/ieee754/dbl-64/math_config.h ++++ b/sysdeps/ieee754/dbl-64/math_config.h +@@ -195,16 +195,18 @@ check_uflow (double x) + extern const struct exp_data + { + double invln2N; +- double shift; + double negln2hiN; + double negln2loN; + double poly[4]; /* Last four coefficients. */ ++ double shift; ++ + double exp2_shift; + double exp2_poly[EXP2_POLY_ORDER]; +- double invlog10_2N; ++ + double neglog10_2hiN; + double neglog10_2loN; + double exp10_poly[5]; ++ double invlog10_2N; + uint64_t tab[2*(1 << EXP_TABLE_BITS)]; + } __exp_data attribute_hidden; + + +commit 3e820e17a8cef84645d83b67abcbc3f88c7fd268 +Author: Michael Jeanson +Date: Fri Feb 14 13:54:22 2025 -0500 + + nptl: clear the whole rseq area before registration + + Due to the extensible nature of the rseq area we can't explictly + initialize fields that are not part of the ABI yet. It was agreed with + upstream that all new fields will be documented as zero initialized by + userspace. Future kernels configured with CONFIG_DEBUG_RSEQ will + validate the content of all fields during registration. + + Replace the explicit field initialization with a memset of the whole + rseq area which will cover fields as they are added to future kernels. + + Signed-off-by: Michael Jeanson + Reviewed-by: Florian Weimer + (cherry picked from commit 689a62a4217fae78b9ce0db781dc2a421f2b1ab4) + +diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c +index 7803e19fd1..ed10185e37 100644 +--- a/sysdeps/nptl/dl-tls_init_tp.c ++++ b/sysdeps/nptl/dl-tls_init_tp.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #define TUNABLE_NAMESPACE pthread + #include +diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h +index ef3eab1fef..76de2b7ff0 100644 +--- a/sysdeps/unix/sysv/linux/rseq-internal.h ++++ b/sysdeps/unix/sysv/linux/rseq-internal.h +@@ -52,13 +52,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) + but still expected size 32. 
*/ + size = RSEQ_AREA_SIZE_INITIAL; + +- /* Initialize the rseq fields that are read by the kernel on +- registration, there is no guarantee that struct pthread is +- cleared on all architectures. */ ++ /* Initialize the whole rseq area to zero prior to registration. */ ++ memset (&self->rseq_area, 0, size); ++ ++ /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by ++ the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. */ + THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED); +- THREAD_SETMEM (self, rseq_area.cpu_id_start, 0); +- THREAD_SETMEM (self, rseq_area.rseq_cs, 0); +- THREAD_SETMEM (self, rseq_area.flags, 0); + + int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, + size, 0, RSEQ_SIG); + +commit ee1ab9302363066b49cf8862b96664ed35eda81c +Author: Sunil K Pandey +Date: Mon Mar 10 10:24:07 2025 -0700 + + x86_64: Add tanh with FMA + + On Skylake, it improves tanh bench performance by: + + Before After Improvement + max 110.89 95.826 14% + min 20.966 20.157 4% + mean 30.9601 29.8431 4% + + Reviewed-by: H.J. Lu + (cherry picked from commit c6352111c72a20b3588ae304dd99b63e25dd6d85) + +diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c +index 673a97102d..13063db04e 100644 +--- a/sysdeps/ieee754/dbl-64/s_tanh.c ++++ b/sysdeps/ieee754/dbl-64/s_tanh.c +@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $"; + + static const double one = 1.0, two = 2.0, tiny = 1.0e-300; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __tanh (double x) + { +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index cbe09d49f4..0f69f7089c 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2 + CFLAGS-s_log1p-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 ++CFLAGS-s_tanh-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 + + CFLAGS-e_exp2f-fma.c = -mfma -mavx2 +@@ -92,6 +93,7 @@ libm-sysdep_routines += \ + s_sinf-sse2 \ + s_tan-avx \ + s_tan-fma \ ++ s_tanh-fma \ + s_trunc-sse4_1 \ + s_truncf-sse4_1 \ + # libm-sysdep_routines +diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c +new file mode 100644 +index 0000000000..1b808b1227 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c +@@ -0,0 +1,11 @@ ++#define __tanh __tanh_fma ++#define __expm1 __expm1_fma ++ ++/* NB: __expm1 may be expanded to __expm1_fma in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c +new file mode 100644 +index 0000000000..5539b6c61c +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of tanh. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL ++ ++extern double __redirect_tanh (double); ++ ++# define SYMBOL_NAME tanh ++# include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ()); ++ ++# define __tanh __tanh_sse2 ++#endif ++#include + +commit e854f6d37cbeabb9130fed74b587befad8b4ba08 +Author: Sunil K Pandey +Date: Sat Mar 8 08:51:10 2025 -0800 + + x86_64: Add sinh with FMA + + On SPR, it improves sinh bench performance by: + + Before After Improvement + reciprocal-throughput 14.2017 11.815 17% + latency 36.4917 35.2114 4% + + Reviewed-by: H.J. Lu + (cherry picked from commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe) + +diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs +index 7b1ac46a39..2fcb2fabf8 100644 +--- a/benchtests/sinh-inputs ++++ b/benchtests/sinh-inputs +@@ -1,6 +1,7 @@ + ## args: double + ## ret: double + ## includes: math.h ++## name: workload-random + 0x1.bcb6129b5ff2bp8 + -0x1.63057386325ebp9 + 0x1.62f1d7dc4e8bfp9 +diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c +index b4b5857ddd..3f787967f9 100644 +--- a/sysdeps/ieee754/dbl-64/e_sinh.c ++++ b/sysdeps/ieee754/dbl-64/e_sinh.c +@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $"; + + static const double one = 1.0, shuge = 1.0e307; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __ieee754_sinh (double x) + { +@@ -90,4 +95,7 @@ __ieee754_sinh (double x) + /* |x| > overflowthresold, sinh(x) overflow */ + return math_narrow_eval (x * shuge); + } ++ ++#ifndef __ieee754_sinh + libm_alias_finite (__ieee754_sinh, __sinh) ++#endif +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 0f69f7089c..b527cab8d1 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2 + CFLAGS-e_log-fma.c = -mfma -mavx2 + CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 ++CFLAGS-e_sinh-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 + CFLAGS-s_expm1-fma.c = -mfma -mavx2 + CFLAGS-s_log1p-fma.c = -mfma -mavx2 +@@ -67,6 +68,7 @@ libm-sysdep_routines += \ + e_logf-fma \ + e_pow-fma \ + e_powf-fma \ ++ e_sinh-fma \ + s_atan-avx \ + s_atan-fma \ + s_ceil-sse4_1 \ +diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c +new file mode 100644 +index 0000000000..e0e1e39a7a +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c +@@ -0,0 +1,12 @@ ++#define __ieee754_sinh __ieee754_sinh_fma ++#define __ieee754_exp __ieee754_exp_fma ++#define __expm1 __expm1_fma ++ ++/* NB: __expm1 may be expanded to __expm1_fma in the following ++ prototypes. 
*/ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c +new file mode 100644 +index 0000000000..3d3c18ccdf +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c +@@ -0,0 +1,35 @@ ++/* Multiple versions of sinh. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL ++# include ++ ++extern double __redirect_ieee754_sinh (double); ++ ++# define SYMBOL_NAME ieee754_sinh ++# include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh, ++ IFUNC_SELECTOR ()); ++ ++libm_alias_finite (__ieee754_sinh, __sinh) ++ ++# define __ieee754_sinh __ieee754_sinh_sse2 ++#endif ++#include + +commit e5f5dfdda28def8362896bdb1748bb27dfc8be73 +Author: Sunil K Pandey +Date: Wed Mar 5 16:13:38 2025 -0800 + + x86_64: Add atanh with FMA + + On SPR, it improves atanh bench performance by: + + Before After Improvement + reciprocal-throughput 15.1715 14.8628 2% + latency 57.1941 56.1883 2% + + Reviewed-by: H.J. 
Lu + (cherry picked from commit c7c4a5906f326f1290b1c2413a83c530564ec4b8) + +diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs +index 455aa65b65..4985293254 100644 +--- a/benchtests/atanh-inputs ++++ b/benchtests/atanh-inputs +@@ -1,6 +1,7 @@ + ## args: double + ## ret: double + ## includes: math.h ++## name: workload-random + 0x1.5a2730bacd94ap-1 + -0x1.b57eb40fc048ep-21 + -0x1.c0b185fb450e2p-17 +diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c +index 11a2a45799..05ac0a1b30 100644 +--- a/sysdeps/ieee754/dbl-64/e_atanh.c ++++ b/sysdeps/ieee754/dbl-64/e_atanh.c +@@ -44,6 +44,11 @@ + + static const double huge = 1e300; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __ieee754_atanh (double x) + { +@@ -73,4 +78,7 @@ __ieee754_atanh (double x) + + return copysign (t, x); + } ++ ++#ifndef __ieee754_atanh + libm_alias_finite (__ieee754_atanh, __atanh) ++#endif +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index b527cab8d1..bc479b42d2 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,6 +1,7 @@ + ifeq ($(subdir),math) + CFLAGS-e_asin-fma.c = -mfma -mavx2 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 ++CFLAGS-e_atanh-fma.c = -mfma -mavx2 + CFLAGS-e_exp-fma.c = -mfma -mavx2 + CFLAGS-e_log-fma.c = -mfma -mavx2 + CFLAGS-e_log2-fma.c = -mfma -mavx2 +@@ -57,6 +58,7 @@ libm-sysdep_routines += \ + e_asin-fma \ + e_atan2-avx \ + e_atan2-fma \ ++ e_atanh-fma \ + e_exp-avx \ + e_exp-fma \ + e_exp2f-fma \ +diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c +new file mode 100644 +index 0000000000..c3f2f9e550 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c +@@ -0,0 +1,6 @@ ++#define __ieee754_atanh __ieee754_atanh_fma ++#define __log1p __log1p_fma ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c +new file mode 100644 +index 0000000000..d2b785dfc0 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c +@@ -0,0 +1,34 @@ ++/* Multiple versions of atanh. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL ++# include ++ ++extern double __redirect_ieee754_atanh (double); ++ ++# define SYMBOL_NAME ieee754_atanh ++# include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ()); ++ ++libm_alias_finite (__ieee754_atanh, __atanh) ++ ++# define __ieee754_atanh __ieee754_atanh_sse2 ++#endif ++#include + +commit 8fc492bb4234edc1a5e8c3b7f76ba345ea7109ec +Author: Florian Weimer +Date: Fri Mar 28 09:26:06 2025 +0100 + + x86: Skip XSAVE state size reset if ISA level requires XSAVE + + If we have to use XSAVE or XSAVEC trampolines, do not adjust the size + information they need. Technically, it is an operator error to try to + run with -XSAVE,-XSAVEC on such builds, but this change here disables + some unnecessary code with higher ISA levels and simplifies testing. + + Related to commit befe2d3c4dec8be2cdd01a47132e47bdb7020922 + ("x86-64: Don't use SSE resolvers for ISA level 3 or above"). + + Reviewed-by: H.J. Lu + (cherry picked from commit 59585ddaa2d44f22af04bb4b8bd4ad1e302c4c02) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index c096dd390a..b5b264db7f 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) + attribute_hidden; +@@ -1119,6 +1120,9 @@ no_cpuid: + TUNABLE_CALLBACK (set_prefer_map_32bit_exec)); + #endif + ++ /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build ++ requires AVX and therefore XSAVE or XSAVEC support. */ ++#ifndef GCCMACRO__AVX__ + bool disable_xsave_features = false; + + if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE)) +@@ -1172,6 +1176,7 @@ no_cpuid: + + CPU_FEATURE_UNSET (cpu_features, FMA4); + } ++#endif + + #ifdef __x86_64__ + GLRO(dl_hwcap) = HWCAP_X86_64; + +commit df22af58f66e6815c054b1c56249356c2994935a +Author: Florian Weimer +Date: Fri Mar 28 09:26:59 2025 +0100 + + x86: Use separate variable for TLSDESC XSAVE/XSAVEC state size (bug 32810) + + Previously, the initialization code reused the xsave_state_full_size + member of struct cpu_features for the TLSDESC state size. However, + the tunable processing code assumes that this member has the + original XSAVE (non-compact) state size, so that it can use its + value if XSAVEC is disabled via tunable. + + This change uses a separate variable and not a struct member because + the value is only needed in ld.so and the static libc, but not in + libc.so. As a result, struct cpu_features layout does not change, + helping a future backport of this change. + + Fixes commit 9b7091415af47082664717210ac49d51551456ab ("x86-64: + Update _dl_tlsdesc_dynamic to preserve AMX registers"). + + Reviewed-by: H.J. 
Lu + (cherry picked from commit 145097dff170507fe73190e8e41194f5b5f7e6bf) + +diff --git a/NEWS b/NEWS +index 57feba81cd..7a6985f5dd 100644 +--- a/NEWS ++++ b/NEWS +@@ -22,6 +22,7 @@ The following bugs are resolved with this release: + [32231] elf: Change ldconfig auxcache magic number + [32245] glibc -Wstringop-overflow= build failure on hppa + [32470] x86: Avoid integer truncation with large cache sizes ++ [32810] Crash on x86-64 if XSAVEC disable via tunable + + Version 2.40 + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 5311b594af..8819fba1b7 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -21,6 +21,9 @@ tests += \ + tst-cpu-features-supports-static \ + tst-get-cpu-features \ + tst-get-cpu-features-static \ ++ tst-gnu2-tls2-x86-noxsave \ ++ tst-gnu2-tls2-x86-noxsavec \ ++ tst-gnu2-tls2-x86-noxsavexsavec \ + tst-hwcap-tunables \ + # tests + tests-static += \ +@@ -91,6 +94,22 @@ CFLAGS-tst-gnu2-tls2.c += -msse + CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell + CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell + CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell ++ ++LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy ++LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy ++LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy ++ ++# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled ++# via tunable. ++tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE ++tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC ++tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC ++$(objpfx)tst-gnu2-tls2-x86-noxsave.out \ ++$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ ++$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ ++ $(objpfx)tst-gnu2-tls2mod0.so \ ++ $(objpfx)tst-gnu2-tls2mod1.so \ ++ $(objpfx)tst-gnu2-tls2mod2.so + endif + + ifeq ($(subdir),math) +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index b5b264db7f..ec27337337 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -84,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *) + # include + #endif + ++unsigned long int _dl_x86_features_tlsdesc_state_size; ++ + static void + update_active (struct cpu_features *cpu_features) + { +@@ -318,6 +320,7 @@ update_active (struct cpu_features *cpu_features) + = xsave_state_full_size; + cpu_features->xsave_state_full_size + = xsave_state_full_size; ++ _dl_x86_features_tlsdesc_state_size = xsave_state_full_size; + + /* Check if XSAVEC is available. */ + if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) +@@ -406,11 +409,9 @@ update_active (struct cpu_features *cpu_features) + = ALIGN_UP ((amx_size + + TLSDESC_CALL_REGISTER_SAVE_AREA), + 64); +- /* Set xsave_state_full_size to the compact AMX +- state size for XSAVEC. NB: xsave_state_full_size +- is only used in _dl_tlsdesc_dynamic_xsave and +- _dl_tlsdesc_dynamic_xsavec. */ +- cpu_features->xsave_state_full_size = amx_size; ++ /* Set TLSDESC state size to the compact AMX ++ state size for XSAVEC. */ ++ _dl_x86_features_tlsdesc_state_size = amx_size; + #endif + cpu_features->xsave_state_size + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index ccc6b64dc2..a0b31d80f6 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + /* Update xsave_state_size to XSAVE state size. 
*/ + cpu_features->xsave_state_size + = cpu_features->xsave_state_full_size; ++ _dl_x86_features_tlsdesc_state_size ++ = cpu_features->xsave_state_full_size; + CPU_FEATURE_UNSET (cpu_features, XSAVEC); + } + } +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index 49eeb5f70a..41100a908a 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -89,6 +89,8 @@ _dl_diagnostics_cpu (void) + cpu_features->xsave_state_size); + print_cpu_features_value ("xsave_state_full_size", + cpu_features->xsave_state_full_size); ++ print_cpu_features_value ("tlsdesc_state_full_size", ++ _dl_x86_features_tlsdesc_state_size); + print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); + print_cpu_features_value ("shared_cache_size", + cpu_features->shared_cache_size); +diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h +index aaae44f0e1..03c71387dd 100644 +--- a/sysdeps/x86/include/cpu-features.h ++++ b/sysdeps/x86/include/cpu-features.h +@@ -934,8 +934,6 @@ struct cpu_features + /* The full state size for XSAVE when XSAVEC is disabled by + + GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC +- +- and the AMX state size when XSAVEC is available. + */ + unsigned int xsave_state_full_size; + /* Data cache size for use in memory and string routines, typically +@@ -989,6 +987,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void) + + #define __get_cpu_features() _dl_x86_get_cpu_features() + ++#if IS_IN (rtld) || IS_IN (libc) ++/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to ++ xsave_state_size from struct cpu_features, this includes additional ++ registers. */ ++extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden; ++#endif ++ + #if defined (_LIBC) && !IS_IN (nonlib) + /* Unused for x86. */ + # define INIT_ARCH() +diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c +new file mode 100644 +index 0000000000..f0024c143d +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c +@@ -0,0 +1 @@ ++#include +diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c +new file mode 100644 +index 0000000000..f0024c143d +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c +@@ -0,0 +1 @@ ++#include +diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c +new file mode 100644 +index 0000000000..f0024c143d +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c +@@ -0,0 +1 @@ ++#include +diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +index 9f02cfc3eb..44d948696f 100644 +--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h ++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: + # endif + #else + /* Allocate stack space of the required size to save the state. */ +- sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP ++ sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP + #endif + /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, + r10 and r11. */ + +commit a87d9a2c2cc17a3b22fd3be8d106336f4dcf2042 +Author: Florian Weimer +Date: Mon Mar 31 21:33:18 2025 +0200 + + x86: Link tst-gnu2-tls2-x86-noxsave{,c,xsavec} with libpthread + + This fixes a test build failure on Hurd. 
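A minimal stand-alone sketch of the bug-32810 idea may help here (illustrative C only, not glibc code: the cpu_features_model struct, the helper names and the sizes below are assumptions). It shows why the TLSDESC save-area size is kept in a separate variable: the -XSAVEC tunable path can then restore xsave_state_size from a full-size value that was never overwritten with the compact AMX size.

/* Minimal model of the separate-variable approach; illustrative only,
   not glibc code.  Struct, helpers and sizes are made up for the sketch.  */
#include <stdio.h>

struct cpu_features_model
{
  unsigned int xsave_state_size;      /* Size used by the lazy-binding trampolines.  */
  unsigned int xsave_state_full_size; /* Non-compact XSAVE state size.  */
};

/* Hypothetical stand-in for _dl_x86_features_tlsdesc_state_size.  */
static unsigned long int tlsdesc_state_size;

static void
init_sizes (struct cpu_features_model *cf, unsigned int xsave_size,
            unsigned int xsavec_amx_size)
{
  cf->xsave_state_full_size = xsave_size;
  cf->xsave_state_size = xsave_size;
  /* With XSAVEC the TLSDESC trampoline needs the larger compact size that
     includes AMX, but xsave_state_full_size stays at the XSAVE size.  */
  tlsdesc_state_size = xsavec_amx_size;
}

static void
disable_xsavec (struct cpu_features_model *cf)
{
  /* GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC path: fall back to the plain
     XSAVE size everywhere, including the TLSDESC save area.  */
  cf->xsave_state_size = cf->xsave_state_full_size;
  tlsdesc_state_size = cf->xsave_state_full_size;
}

int
main (void)
{
  struct cpu_features_model cf;
  init_sizes (&cf, 2696, 11008);        /* Example sizes only.  */
  disable_xsavec (&cf);
  printf ("xsave_state_size=%u tlsdesc_state_size=%lu\n",
          cf.xsave_state_size, tlsdesc_state_size);
  return 0;
}

Compiled on its own, the model just prints the restored sizes; in the real tree the equivalent behaviour is exercised by the tst-gnu2-tls2-x86-noxsave* tests above through their GLIBC_TUNABLES environment settings.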
+ + Fixes commit 145097dff170507fe73190e8e41194f5b5f7e6bf ("x86: Use separate + variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)"). + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit c6e2895695118ab59c7b17feb0fcb75a53e3478c) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 8819fba1b7..01b0192ddf 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -104,6 +104,9 @@ LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy + tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE + tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC + tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC ++$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library) ++$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library) ++$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library) + $(objpfx)tst-gnu2-tls2-x86-noxsave.out \ + $(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ + $(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ + +commit 8fe27af20c8b25b84e12bcd52353862a95044aa2 +Author: Noah Goldstein +Date: Wed Aug 14 14:37:30 2024 +0800 + + x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path + + This is just a refactor and there should be no behavioral change from + this commit. + + The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob + for controlling whether we use non-temporal memset rather than having + extra logic based on vendor. + Reviewed-by: H.J. Lu + + (cherry picked from commit b93dddfaf440aa12f45d7c356f6ffe9f27d35577) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index ec27337337..8841020b36 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -758,6 +758,12 @@ init_cpu_features (struct cpu_features *cpu_features) + unsigned int stepping = 0; + enum cpu_features_kind kind; + ++ /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is, ++ as of writing this, we only have benchmarks indicatings it profitability ++ on Intel/AMD. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ + cpu_features->cachesize_non_temporal_divisor = 4; + #if !HAS_CPUID + if (__get_cpuid_max (0, 0) == 0) +@@ -783,6 +789,11 @@ init_cpu_features (struct cpu_features *cpu_features) + + update_active (cpu_features); + ++ /* Benchmarks indicate non-temporal memset can be profitable on Intel ++ hardware. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ &= ~bit_arch_Avoid_Non_Temporal_Memset; ++ + if (family == 0x06) + { + model += extended_model; +@@ -993,6 +1004,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht + + ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx; + ++ /* Benchmarks indicate non-temporal memset can be profitable on AMD ++ hardware. 
*/ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ &= ~bit_arch_Avoid_Non_Temporal_Memset; ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + { + /* Since the FMA4 bit is in CPUID_INDEX_80000001 and +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index ac97414b5b..7b1b61c096 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + +- /* Non-temporal stores are more performant on Intel and AMD hardware above +- non_temporal_threshold. Enable this for both Intel and AMD hardware. */ +- unsigned long int memset_non_temporal_threshold = SIZE_MAX; +- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) +- && (cpu_features->basic.kind == arch_kind_intel +- || cpu_features->basic.kind == arch_kind_amd)) +- memset_non_temporal_threshold = non_temporal_threshold; +- + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of + cases slower than the vectorized path (and for some alignments, + it is really slow, check BZ #30994). */ +@@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (tunable_size != 0) + shared = tunable_size; + ++ /* Non-temporal stores are more performant on some hardware above ++ non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both ++ Intel and AMD hardware. */ ++ unsigned long int memset_non_temporal_threshold = SIZE_MAX; ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ + tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) + +commit 7c6bd71b4dbdadab34e4fd21ec09b86b32daf443 +Author: Sunil K Pandey +Date: Thu Apr 3 13:00:45 2025 -0700 + + x86: Optimize xstate size calculation + + Scan xstate IDs up to the maximum supported xstate ID. Remove the + separate AMX xstate calculation. Instead, exclude the AMX space from + the start of TILECFG to the end of TILEDATA in xsave_state_size. + + Completed validation on SKL/SKX/SPR/SDE and compared xsave state size + with "ld.so --list-diagnostics" option, no regression. + + Co-Authored-By: H.J. Lu + Reviewed-by: Sunil K Pandey + (cherry picked from commit 70b648855185e967e54668b101d24704c3fb869d) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 8841020b36..1d5e2a0072 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -325,13 +325,8 @@ update_active (struct cpu_features *cpu_features) + /* Check if XSAVEC is available. 
*/ + if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) + { +- unsigned int xstate_comp_offsets[32]; +- unsigned int xstate_comp_sizes[32]; +-#ifdef __x86_64__ +- unsigned int xstate_amx_comp_offsets[32]; +- unsigned int xstate_amx_comp_sizes[32]; +- unsigned int amx_ecx; +-#endif ++ unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1]; ++ unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1]; + unsigned int i; + + xstate_comp_offsets[0] = 0; +@@ -339,39 +334,16 @@ update_active (struct cpu_features *cpu_features) + xstate_comp_offsets[2] = 576; + xstate_comp_sizes[0] = 160; + xstate_comp_sizes[1] = 256; +-#ifdef __x86_64__ +- xstate_amx_comp_offsets[0] = 0; +- xstate_amx_comp_offsets[1] = 160; +- xstate_amx_comp_offsets[2] = 576; +- xstate_amx_comp_sizes[0] = 160; +- xstate_amx_comp_sizes[1] = 256; +-#endif + +- for (i = 2; i < 32; i++) ++ for (i = 2; i <= X86_XSTATE_MAX_ID; i++) + { + if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) + { + __cpuid_count (0xd, i, eax, ebx, ecx, edx); +-#ifdef __x86_64__ +- /* Include this in xsave_state_full_size. */ +- amx_ecx = ecx; +- xstate_amx_comp_sizes[i] = eax; +- if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) +- { +- /* Exclude this from xsave_state_size. */ +- ecx = 0; +- xstate_comp_sizes[i] = 0; +- } +- else +-#endif +- xstate_comp_sizes[i] = eax; ++ xstate_comp_sizes[i] = eax; + } + else + { +-#ifdef __x86_64__ +- amx_ecx = 0; +- xstate_amx_comp_sizes[i] = 0; +-#endif + ecx = 0; + xstate_comp_sizes[i] = 0; + } +@@ -380,42 +352,32 @@ update_active (struct cpu_features *cpu_features) + { + xstate_comp_offsets[i] + = (xstate_comp_offsets[i - 1] +- + xstate_comp_sizes[i -1]); ++ + xstate_comp_sizes[i - 1]); + if ((ecx & (1 << 1)) != 0) + xstate_comp_offsets[i] + = ALIGN_UP (xstate_comp_offsets[i], 64); +-#ifdef __x86_64__ +- xstate_amx_comp_offsets[i] +- = (xstate_amx_comp_offsets[i - 1] +- + xstate_amx_comp_sizes[i - 1]); +- if ((amx_ecx & (1 << 1)) != 0) +- xstate_amx_comp_offsets[i] +- = ALIGN_UP (xstate_amx_comp_offsets[i], +- 64); +-#endif + } + } + + /* Use XSAVEC. */ + unsigned int size +- = xstate_comp_offsets[31] + xstate_comp_sizes[31]; ++ = (xstate_comp_offsets[X86_XSTATE_MAX_ID] ++ + xstate_comp_sizes[X86_XSTATE_MAX_ID]); + if (size) + { ++ size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, ++ 64); + #ifdef __x86_64__ +- unsigned int amx_size +- = (xstate_amx_comp_offsets[31] +- + xstate_amx_comp_sizes[31]); +- amx_size +- = ALIGN_UP ((amx_size +- + TLSDESC_CALL_REGISTER_SAVE_AREA), +- 64); +- /* Set TLSDESC state size to the compact AMX +- state size for XSAVEC. */ +- _dl_x86_features_tlsdesc_state_size = amx_size; ++ _dl_x86_features_tlsdesc_state_size = size; ++ /* Exclude the AMX space from the start of TILECFG ++ space to the end of TILEDATA space. If CPU ++ doesn't support AMX, TILECFG offset is the same ++ as TILEDATA + 1 offset. Otherwise, they are ++ multiples of 64. */ ++ size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1] ++ - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]); + #endif +- cpu_features->xsave_state_size +- = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, +- 64); ++ cpu_features->xsave_state_size = size; + CPU_FEATURE_SET (cpu_features, XSAVEC); + } + } +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 7359149e17..1d6cabd816 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -102,6 +102,9 @@ + | (1 << X86_XSTATE_ZMM_ID) \ + | (1 << X86_XSTATE_APX_F_ID)) + ++/* The maximum supported xstate ID. */ ++# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID ++ + /* AMX state mask. 
*/ + # define AMX_STATE_SAVE_MASK \ + ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) +@@ -123,6 +126,9 @@ + | (1 << X86_XSTATE_K_ID) \ + | (1 << X86_XSTATE_ZMM_H_ID)) + ++/* The maximum supported xstate ID. */ ++# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID ++ + /* States to be included in xsave_state_size. */ + # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK + #endif + +commit 44f92df8007d57f82b1518e219a0dbb60389ef2c +Author: Sunil K Pandey +Date: Thu Apr 3 18:14:20 2025 -0700 + + x86: Add ARL/PTL/CWF model detection support + + - Add ARROWLAKE model detection. + - Add PANTHERLAKE model detection. + - Add CLEARWATERFOREST model detection. + + Intel® Architecture Instruction Set Extensions Programming Reference + https://cdrdv2.intel.com/v1/dl/getContent/671368 Section 1.2. + + No regression, validated model detection on SDE. + + Reviewed-by: H.J. Lu + (cherry picked from commit e53eb952b970ac94c97d74fb447418fb327ca096) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 1d5e2a0072..7f21a8227e 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -512,6 +512,7 @@ enum + INTEL_ATOM_GOLDMONT, + INTEL_ATOM_GOLDMONT_PLUS, + INTEL_ATOM_SIERRAFOREST, ++ INTEL_ATOM_CLEARWATERFOREST, + INTEL_ATOM_GRANDRIDGE, + INTEL_ATOM_TREMONT, + +@@ -539,6 +540,7 @@ enum + INTEL_BIGCORE_METEORLAKE, + INTEL_BIGCORE_LUNARLAKE, + INTEL_BIGCORE_ARROWLAKE, ++ INTEL_BIGCORE_PANTHERLAKE, + INTEL_BIGCORE_GRANITERAPIDS, + + /* Mixed (bigcore + atom SOC). */ +@@ -584,6 +586,8 @@ intel_get_fam6_microarch (unsigned int model, + return INTEL_ATOM_GOLDMONT_PLUS; + case 0xAF: + return INTEL_ATOM_SIERRAFOREST; ++ case 0xDD: ++ return INTEL_ATOM_CLEARWATERFOREST; + case 0xB6: + return INTEL_ATOM_GRANDRIDGE; + case 0x86: +@@ -691,8 +695,12 @@ intel_get_fam6_microarch (unsigned int model, + return INTEL_BIGCORE_METEORLAKE; + case 0xbd: + return INTEL_BIGCORE_LUNARLAKE; ++ case 0xb5: ++ case 0xc5: + case 0xc6: + return INTEL_BIGCORE_ARROWLAKE; ++ case 0xCC: ++ return INTEL_BIGCORE_PANTHERLAKE; + case 0xAD: + case 0xAE: + return INTEL_BIGCORE_GRANITERAPIDS; +@@ -808,6 +816,7 @@ init_cpu_features (struct cpu_features *cpu_features) + Default tuned atom microarch. + case INTEL_ATOM_SIERRAFOREST: + case INTEL_ATOM_GRANDRIDGE: ++ case INTEL_ATOM_CLEARWATERFOREST: + */ + + /* Bigcore/Default Tuning. */ +@@ -864,6 +873,7 @@ init_cpu_features (struct cpu_features *cpu_features) + case INTEL_BIGCORE_METEORLAKE: + case INTEL_BIGCORE_LUNARLAKE: + case INTEL_BIGCORE_ARROWLAKE: ++ case INTEL_BIGCORE_PANTHERLAKE: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: + +commit 9ee8083c4edbe5e92af7aabb23261309f03ef05c +Author: Sunil K Pandey +Date: Fri Apr 11 08:52:52 2025 -0700 + + x86: Handle unknown Intel processor with default tuning + + Enable default tuning for unknown Intel processor. + + Tested on x86, no regression. + + Co-Authored-By: H.J. Lu + Reviewed-by: H.J. Lu + (cherry picked from commit 9f0deff558d1d6b08c425c157f50de85013ada9c) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 7f21a8227e..1a6e694abf 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load + "Incorrect index_arch_Fast_Unaligned_Load"); + + +-/* Intel Family-6 microarch list. */ +-enum ++/* Intel microarch list. */ ++enum intel_microarch + { + /* Atom processors. 
*/ + INTEL_ATOM_BONNELL, +@@ -555,7 +555,7 @@ enum + INTEL_UNKNOWN, + }; + +-static unsigned int ++static enum intel_microarch + intel_get_fam6_microarch (unsigned int model, + __attribute__ ((unused)) unsigned int stepping) + { +@@ -764,134 +764,20 @@ init_cpu_features (struct cpu_features *cpu_features) + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + ++ enum intel_microarch microarch = INTEL_UNKNOWN; + if (family == 0x06) + { + model += extended_model; +- unsigned int microarch +- = intel_get_fam6_microarch (model, stepping); ++ microarch = intel_get_fam6_microarch (model, stepping); + ++ /* Disable TSX on some processors to avoid TSX on kernels that ++ weren't updated with the latest microcode package (which ++ disables broken feature by default). */ + switch (microarch) + { +- /* Atom / KNL tuning. */ +- case INTEL_ATOM_BONNELL: +- /* BSF is slow on Bonnell. */ +- cpu_features->preferred[index_arch_Slow_BSF] +- |= bit_arch_Slow_BSF; +- break; +- +- /* Unaligned load versions are faster than SSSE3 +- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ +- case INTEL_ATOM_AIRMONT: +- case INTEL_ATOM_SILVERMONT: +- case INTEL_ATOM_GOLDMONT: +- case INTEL_ATOM_GOLDMONT_PLUS: +- +- /* Knights Landing. Enable Silvermont optimizations. */ +- case INTEL_KNIGHTS_LANDING: +- +- cpu_features->preferred[index_arch_Fast_Unaligned_Load] +- |= (bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop +- | bit_arch_Slow_SSE4_2); +- break; +- +- case INTEL_ATOM_TREMONT: +- /* Enable rep string instructions, unaligned load, unaligned +- copy, pminub and avoid SSE 4.2 on Tremont. */ +- cpu_features->preferred[index_arch_Fast_Rep_String] +- |= (bit_arch_Fast_Rep_String +- | bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop +- | bit_arch_Slow_SSE4_2); +- break; +- +- /* +- Default tuned Knights microarch. +- case INTEL_KNIGHTS_MILL: +- */ +- +- /* +- Default tuned atom microarch. +- case INTEL_ATOM_SIERRAFOREST: +- case INTEL_ATOM_GRANDRIDGE: +- case INTEL_ATOM_CLEARWATERFOREST: +- */ +- +- /* Bigcore/Default Tuning. */ + default: +- default_tuning: +- /* Unknown family 0x06 processors. Assuming this is one +- of Core i3/i5/i7 processors if AVX is available. */ +- if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) +- break; +- +- enable_modern_features: +- /* Rep string instructions, unaligned load, unaligned copy, +- and pminub are fast on Intel Core i3, i5 and i7. */ +- cpu_features->preferred[index_arch_Fast_Rep_String] +- |= (bit_arch_Fast_Rep_String +- | bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop); + break; + +- case INTEL_BIGCORE_NEHALEM: +- case INTEL_BIGCORE_WESTMERE: +- /* Older CPUs prefer non-temporal stores at lower threshold. */ +- cpu_features->cachesize_non_temporal_divisor = 8; +- goto enable_modern_features; +- +- /* Older Bigcore microarch (smaller non-temporal store +- threshold). */ +- case INTEL_BIGCORE_SANDYBRIDGE: +- case INTEL_BIGCORE_IVYBRIDGE: +- case INTEL_BIGCORE_HASWELL: +- case INTEL_BIGCORE_BROADWELL: +- cpu_features->cachesize_non_temporal_divisor = 8; +- goto default_tuning; +- +- /* Newer Bigcore microarch (larger non-temporal store +- threshold). */ +- case INTEL_BIGCORE_SKYLAKE_AVX512: +- case INTEL_BIGCORE_CANNONLAKE: +- /* Benchmarks indicate non-temporal memset is not +- necessarily profitable on SKX (and in some cases much +- worse). 
This is likely unique to SKX due its it unique +- mesh interconnect (not present on ICX or BWD). Disable +- non-temporal on all Skylake servers. */ +- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] +- |= bit_arch_Avoid_Non_Temporal_Memset; +- case INTEL_BIGCORE_COMETLAKE: +- case INTEL_BIGCORE_SKYLAKE: +- case INTEL_BIGCORE_KABYLAKE: +- case INTEL_BIGCORE_ICELAKE: +- case INTEL_BIGCORE_TIGERLAKE: +- case INTEL_BIGCORE_ROCKETLAKE: +- case INTEL_BIGCORE_RAPTORLAKE: +- case INTEL_BIGCORE_METEORLAKE: +- case INTEL_BIGCORE_LUNARLAKE: +- case INTEL_BIGCORE_ARROWLAKE: +- case INTEL_BIGCORE_PANTHERLAKE: +- case INTEL_BIGCORE_SAPPHIRERAPIDS: +- case INTEL_BIGCORE_EMERALDRAPIDS: +- case INTEL_BIGCORE_GRANITERAPIDS: +- cpu_features->cachesize_non_temporal_divisor = 2; +- goto default_tuning; +- +- /* Default tuned Mixed (bigcore + atom SOC). */ +- case INTEL_MIXED_LAKEFIELD: +- case INTEL_MIXED_ALDERLAKE: +- cpu_features->cachesize_non_temporal_divisor = 2; +- goto default_tuning; +- } +- +- /* Disable TSX on some processors to avoid TSX on kernels that +- weren't updated with the latest microcode package (which +- disables broken feature by default). */ +- switch (microarch) +- { + case INTEL_BIGCORE_SKYLAKE_AVX512: + /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ + if (stepping <= 5) +@@ -900,38 +786,152 @@ init_cpu_features (struct cpu_features *cpu_features) + + case INTEL_BIGCORE_KABYLAKE: + /* NB: Although the errata documents that for model == 0x8e +- (kabylake skylake client), only 0xb stepping or lower are +- impacted, the intention of the errata was to disable TSX on +- all client processors on all steppings. Include 0xc +- stepping which is an Intel Core i7-8665U, a client mobile +- processor. */ ++ (kabylake skylake client), only 0xb stepping or lower are ++ impacted, the intention of the errata was to disable TSX on ++ all client processors on all steppings. Include 0xc ++ stepping which is an Intel Core i7-8665U, a client mobile ++ processor. */ + if (stepping > 0xc) + break; + /* Fall through. */ + case INTEL_BIGCORE_SKYLAKE: +- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for +- processors listed in: +- +-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html +- */ +- disable_tsx: +- CPU_FEATURE_UNSET (cpu_features, HLE); +- CPU_FEATURE_UNSET (cpu_features, RTM); +- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); +- break; ++ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for ++ processors listed in: ++ ++ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html ++ */ ++disable_tsx: ++ CPU_FEATURE_UNSET (cpu_features, HLE); ++ CPU_FEATURE_UNSET (cpu_features, RTM); ++ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); ++ break; + + case INTEL_BIGCORE_HASWELL: +- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working +- TSX. Haswell also include other model numbers that have +- working TSX. */ +- if (model == 0x3f && stepping >= 4) ++ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working ++ TSX. Haswell also includes other model numbers that have ++ working TSX. */ ++ if (model == 0x3f && stepping >= 4) + break; + +- CPU_FEATURE_UNSET (cpu_features, RTM); +- break; ++ CPU_FEATURE_UNSET (cpu_features, RTM); ++ break; + } + } + ++ switch (microarch) ++ { ++ /* Atom / KNL tuning. */ ++ case INTEL_ATOM_BONNELL: ++ /* BSF is slow on Bonnell. 
*/ ++ cpu_features->preferred[index_arch_Slow_BSF] ++ |= bit_arch_Slow_BSF; ++ break; ++ ++ /* Unaligned load versions are faster than SSSE3 ++ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ ++ case INTEL_ATOM_AIRMONT: ++ case INTEL_ATOM_SILVERMONT: ++ case INTEL_ATOM_GOLDMONT: ++ case INTEL_ATOM_GOLDMONT_PLUS: ++ ++ /* Knights Landing. Enable Silvermont optimizations. */ ++ case INTEL_KNIGHTS_LANDING: ++ ++ cpu_features->preferred[index_arch_Fast_Unaligned_Load] ++ |= (bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop ++ | bit_arch_Slow_SSE4_2); ++ break; ++ ++ case INTEL_ATOM_TREMONT: ++ /* Enable rep string instructions, unaligned load, unaligned ++ copy, pminub and avoid SSE 4.2 on Tremont. */ ++ cpu_features->preferred[index_arch_Fast_Rep_String] ++ |= (bit_arch_Fast_Rep_String ++ | bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop ++ | bit_arch_Slow_SSE4_2); ++ break; ++ ++ /* ++ Default tuned Knights microarch. ++ case INTEL_KNIGHTS_MILL: ++ */ ++ ++ /* ++ Default tuned atom microarch. ++ case INTEL_ATOM_SIERRAFOREST: ++ case INTEL_ATOM_GRANDRIDGE: ++ case INTEL_ATOM_CLEARWATERFOREST: ++ */ ++ ++ /* Bigcore/Default Tuning. */ ++ default: ++ default_tuning: ++ /* Unknown Intel processors. Assuming this is one of Core ++ i3/i5/i7 processors if AVX is available. */ ++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) ++ break; ++ ++ enable_modern_features: ++ /* Rep string instructions, unaligned load, unaligned copy, ++ and pminub are fast on Intel Core i3, i5 and i7. */ ++ cpu_features->preferred[index_arch_Fast_Rep_String] ++ |= (bit_arch_Fast_Rep_String ++ | bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop); ++ break; ++ ++ case INTEL_BIGCORE_NEHALEM: ++ case INTEL_BIGCORE_WESTMERE: ++ /* Older CPUs prefer non-temporal stores at lower threshold. */ ++ cpu_features->cachesize_non_temporal_divisor = 8; ++ goto enable_modern_features; ++ ++ /* Older Bigcore microarch (smaller non-temporal store ++ threshold). */ ++ case INTEL_BIGCORE_SANDYBRIDGE: ++ case INTEL_BIGCORE_IVYBRIDGE: ++ case INTEL_BIGCORE_HASWELL: ++ case INTEL_BIGCORE_BROADWELL: ++ cpu_features->cachesize_non_temporal_divisor = 8; ++ goto default_tuning; ++ ++ /* Newer Bigcore microarch (larger non-temporal store ++ threshold). */ ++ case INTEL_BIGCORE_SKYLAKE_AVX512: ++ case INTEL_BIGCORE_CANNONLAKE: ++ /* Benchmarks indicate non-temporal memset is not ++ necessarily profitable on SKX (and in some cases much ++ worse). This is likely unique to SKX due to its unique ++ mesh interconnect (not present on ICX or BWD). Disable ++ non-temporal on all Skylake servers. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ /* fallthrough */ ++ case INTEL_BIGCORE_COMETLAKE: ++ case INTEL_BIGCORE_SKYLAKE: ++ case INTEL_BIGCORE_KABYLAKE: ++ case INTEL_BIGCORE_ICELAKE: ++ case INTEL_BIGCORE_TIGERLAKE: ++ case INTEL_BIGCORE_ROCKETLAKE: ++ case INTEL_BIGCORE_RAPTORLAKE: ++ case INTEL_BIGCORE_METEORLAKE: ++ case INTEL_BIGCORE_LUNARLAKE: ++ case INTEL_BIGCORE_ARROWLAKE: ++ case INTEL_BIGCORE_PANTHERLAKE: ++ case INTEL_BIGCORE_SAPPHIRERAPIDS: ++ case INTEL_BIGCORE_EMERALDRAPIDS: ++ case INTEL_BIGCORE_GRANITERAPIDS: ++ /* Default tuned Mixed (bigcore + atom SOC). 
*/ ++ case INTEL_MIXED_LAKEFIELD: ++ case INTEL_MIXED_ALDERLAKE: ++ cpu_features->cachesize_non_temporal_divisor = 2; ++ goto default_tuning; ++ } + + /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER + if AVX512ER is available. Don't use AVX512 to avoid lower CPU + +commit d8a1a1aef7a58b991505b9a1349a40736dec3abf +Author: H.J. Lu +Date: Sat Apr 12 08:37:29 2025 -0700 + + x86: Detect Intel Diamond Rapids + + Detect Intel Diamond Rapids and tune it similar to Intel Granite Rapids. + + Signed-off-by: H.J. Lu + Reviewed-by: Sunil K Pandey + (cherry picked from commit de14f1959ee5f9b845a7cae43bee03068b8136f0) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 1a6e694abf..52a2f03bdd 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -542,6 +542,7 @@ enum intel_microarch + INTEL_BIGCORE_ARROWLAKE, + INTEL_BIGCORE_PANTHERLAKE, + INTEL_BIGCORE_GRANITERAPIDS, ++ INTEL_BIGCORE_DIAMONDRAPIDS, + + /* Mixed (bigcore + atom SOC). */ + INTEL_MIXED_LAKEFIELD, +@@ -817,6 +818,16 @@ disable_tsx: + break; + } + } ++ else if (family == 19) ++ switch (model) ++ { ++ case 0x01: ++ microarch = INTEL_BIGCORE_DIAMONDRAPIDS; ++ break; ++ ++ default: ++ break; ++ } + + switch (microarch) + { +@@ -926,6 +937,7 @@ disable_tsx: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: ++ case INTEL_BIGCORE_DIAMONDRAPIDS: + /* Default tuned Mixed (bigcore + atom SOC). */ + case INTEL_MIXED_LAKEFIELD: + case INTEL_MIXED_ALDERLAKE: + +commit 736e6735053f12181d3d287898dd5fdb9e8baf59 +Author: Frank Barrus +Date: Wed Dec 4 07:55:02 2024 -0500 + + pthreads NPTL: lost wakeup fix 2 + + This fixes the lost wakeup (from a bug in signal stealing) with a change + in the usage of g_signals[] in the condition variable internal state. + It also completely eliminates the concept and handling of signal stealing, + as well as the need for signalers to block to wait for waiters to wake + up every time there is a G1/G2 switch. This greatly reduces the average + and maximum latency for pthread_cond_signal. + + The g_signals[] field now contains a signal count that is relative to + the current g1_start value. Since it is a 32-bit field, and the LSB is + still reserved (though not currently used anymore), it has a 31-bit value + that corresponds to the low 31 bits of the sequence number in g1_start. + (since g1_start also has an LSB flag, this means bits 31:1 in g_signals + correspond to bits 31:1 in g1_start, plus the current signal count) + + By making the signal count relative to g1_start, there is no longer + any ambiguity or A/B/A issue, and thus any checks before blocking, + including the futex call itself, are guaranteed not to block if the G1/G2 + switch occurs, even if the signal count remains the same. This allows + initially safely blocking in G2 until the switch to G1 occurs, and + then transitioning from G1 to a new G1 or G2, and always being able to + distinguish the state change. This removes the race condition and A/B/A + problems that otherwise ocurred if a late (pre-empted) waiter were to + resume just as the futex call attempted to block on g_signal since + otherwise there was no last opportunity to re-check things like whether + the current G1 group was already closed. + + By fixing these issues, the signal stealing code can be eliminated, + since there is no concept of signal stealing anymore. 
The code to block + for all waiters to exit g_refs can also be removed, since any waiters + that are still in the g_refs region can be guaranteed to safely wake + up and exit. If there are still any left at this time, they are all + sent one final futex wakeup to ensure that they are not blocked any + longer, but there is no need for the signaller to block and wait for + them to wake up and exit the g_refs region. + + The signal count is then effectively "zeroed" but since it is now + relative to g1_start, this is done by advancing it to a new value that + can be observed by any pending blocking waiters. Any late waiters can + always tell the difference, and can thus just cleanly exit if they are + in a stale G1 or G2. They can never steal a signal from the current + G1 if they are not in the current G1, since the signal value that has + to match in the cmpxchg has the low 31 bits of the g1_start value + contained in it, and that's first checked, and then it won't match if + there's a G1/G2 change. + + Note: the 31-bit sequence number used in g_signals is designed to + handle wrap-around when checking the signal count, but if the entire + 31-bit wraparound (2 billion signals) occurs while there is still a + late waiter that has not yet resumed, and it happens to then match + the current g1_start low bits, and the pre-emption occurs after the + normal "closed group" checks (which are 64-bit) but then hits the + futex syscall and signal consuming code, then an A/B/A issue could + still result and cause an incorrect assumption about whether it + should block. This particular scenario seems unlikely in practice. + Note that once awake from the futex, the waiter would notice the + closed group before consuming the signal (since that's still a 64-bit + check that would not be aliased in the wrap-around in g_signals), + so the biggest impact would be blocking on the futex until the next + full wakeup from a G1/G2 switch. + + Signed-off-by: Frank Barrus + Reviewed-by: Carlos O'Donell + (cherry picked from commit 1db84775f831a1494993ce9c118deaf9537cc50a) + +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 3487557bb8..4855b8899f 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -201,7 +201,6 @@ static bool __attribute__ ((unused)) + __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + unsigned int *g1index, int private) + { +- const unsigned int maxspin = 0; + unsigned int g1 = *g1index; + + /* If there is no waiter in G2, we don't do anything. The expression may +@@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + * New waiters arriving concurrently with the group switching will all go + into G2 until we atomically make the switch. Waiters existing in G2 + are not affected. +- * Waiters in G1 will be closed out immediately by setting a flag in +- __g_signals, which will prevent waiters from blocking using a futex on +- __g_signals and also notifies them that the group is closed. As a +- result, they will eventually remove their group reference, allowing us +- to close switch group roles. */ +- +- /* First, set the closed flag on __g_signals. This tells waiters that are +- about to wait that they shouldn't do that anymore. This basically +- serves as an advance notification of the upcoming change to __g1_start; +- waiters interpret it as if __g1_start was larger than their waiter +- sequence position. 
This allows us to change __g1_start after waiting +- for all existing waiters with group references to leave, which in turn +- makes recovery after stealing a signal simpler because it then can be +- skipped if __g1_start indicates that the group is closed (otherwise, +- we would have to recover always because waiters don't know how big their +- groups are). Relaxed MO is fine. */ +- atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1); +- +- /* Wait until there are no group references anymore. The fetch-or operation +- injects us into the modification order of __g_refs; release MO ensures +- that waiters incrementing __g_refs after our fetch-or see the previous +- changes to __g_signals and to __g1_start that had to happen before we can +- switch this G1 and alias with an older group (we have two groups, so +- aliasing requires switching group roles twice). Note that nobody else +- can have set the wake-request flag, so we do not have to act upon it. +- +- Also note that it is harmless if older waiters or waiters from this G1 +- get a group reference after we have quiesced the group because it will +- remain closed for them either because of the closed flag in __g_signals +- or the later update to __g1_start. New waiters will never arrive here +- but instead continue to go into the still current G2. */ +- unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0); +- while ((r >> 1) > 0) +- { +- for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--) +- { +- /* TODO Back off. */ +- r = atomic_load_relaxed (cond->__data.__g_refs + g1); +- } +- if ((r >> 1) > 0) +- { +- /* There is still a waiter after spinning. Set the wake-request +- flag and block. Relaxed MO is fine because this is just about +- this futex word. +- +- Update r to include the set wake-request flag so that the upcoming +- futex_wait only blocks if the flag is still set (otherwise, we'd +- violate the basic client-side futex protocol). */ +- r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1; +- +- if ((r >> 1) > 0) +- futex_wait_simple (cond->__data.__g_refs + g1, r, private); +- /* Reload here so we eventually see the most recent value even if we +- do not spin. */ +- r = atomic_load_relaxed (cond->__data.__g_refs + g1); +- } +- } +- /* Acquire MO so that we synchronize with the release operation that waiters +- use to decrement __g_refs and thus happen after the waiters we waited +- for. */ +- atomic_thread_fence_acquire (); ++ * Waiters in G1 will be closed out immediately by the advancing of ++ __g_signals to the next "lowseq" (low 31 bits of the new g1_start), ++ which will prevent waiters from blocking using a futex on ++ __g_signals since it provides enough signals for all possible ++ remaining waiters. As a result, they can each consume a signal ++ and they will eventually remove their group reference. */ + + /* Update __g1_start, which finishes closing this group. The value we add + will never be negative because old_orig_size can only be zero when we + switch groups the first time after a condvar was initialized, in which +- case G1 will be at index 1 and we will add a value of 1. See above for +- why this takes place after waiting for quiescence of the group. ++ case G1 will be at index 1 and we will add a value of 1. + Relaxed MO is fine because the change comes with no additional + constraints that others would have to observe. */ + __condvar_add_g1_start_relaxed (cond, + (old_orig_size << 1) + (g1 == 1 ? 
1 : - 1)); + +- /* Now reopen the group, thus enabling waiters to again block using the +- futex controlled by __g_signals. Release MO so that observers that see +- no signals (and thus can block) also see the write __g1_start and thus +- that this is now a new group (see __pthread_cond_wait_common for the +- matching acquire MO loads). */ +- atomic_store_release (cond->__data.__g_signals + g1, 0); ++ unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; ++ ++ /* If any waiters still hold group references (and thus could be blocked), ++ then wake them all up now and prevent any running ones from blocking. ++ This is effectively a catch-all for any possible current or future ++ bugs that can allow the group size to reach 0 before all G1 waiters ++ have been awakened or at least given signals to consume, or any ++ other case that can leave blocked (or about to block) older waiters.. */ ++ if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0) ++ { ++ /* First advance signals to the end of the group (i.e. enough signals ++ for the entire G1 group) to ensure that waiters which have not ++ yet blocked in the futex will not block. ++ Note that in the vast majority of cases, this should never ++ actually be necessary, since __g_signals will have enough ++ signals for the remaining g_refs waiters. As an optimization, ++ we could check this first before proceeding, although that ++ could still leave the potential for futex lost wakeup bugs ++ if the signal count was non-zero but the futex wakeup ++ was somehow lost. */ ++ atomic_store_release (cond->__data.__g_signals + g1, lowseq); ++ ++ futex_wake (cond->__data.__g_signals + g1, INT_MAX, private); ++ } + + /* At this point, the old G1 is now a valid new G2 (but not in use yet). + No old waiter can neither grab a signal nor acquire a reference without +@@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + g1 ^= 1; + *g1index ^= 1; + ++ /* Now advance the new G1 g_signals to the new lowseq, giving it ++ an effective signal count of 0 to start. */ ++ atomic_store_release (cond->__data.__g_signals + g1, lowseq); ++ + /* These values are just observed by signalers, and thus protected by the + lock. */ + unsigned int orig_size = wseq - (old_g1_start + old_orig_size); +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 66786c7b90..3d290e39c8 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg) + signaled), and a reference count. + + The group reference count is used to maintain the number of waiters that +- are using the group's futex. Before a group can change its role, the +- reference count must show that no waiters are using the futex anymore; this +- prevents ABA issues on the futex word. ++ are using the group's futex. + + To represent which intervals in the waiter sequence the groups cover (and + thus also which group slot contains G1 or G2), we use a 64b counter to +@@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg) + last reference. + * Reference count used by waiters concurrently with signalers that have + acquired the condvar-internal lock. +- __g_signals: The number of signals that can still be consumed. ++ __g_signals: The number of signals that can still be consumed, relative to ++ the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits ++ 31 to 1 of g1_start with the signal count added) + * Used as a futex word by waiters. 
Used concurrently by waiters and + signalers. +- * LSB is true iff this group has been completely signaled (i.e., it is +- closed). ++ * LSB is currently reserved and 0. + __g_size: Waiters remaining in this group (i.e., which have not been + signaled yet. + * Accessed by signalers and waiters that cancel waiting (both do so only +@@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg) + sufficient because if a waiter can see a sufficiently large value, it could + have also consume a signal in the waiters group. + +- Waiters try to grab a signal from __g_signals without holding a reference +- count, which can lead to stealing a signal from a more recent group after +- their own group was already closed. They cannot always detect whether they +- in fact did because they do not know when they stole, but they can +- conservatively add a signal back to the group they stole from; if they +- did so unnecessarily, all that happens is a spurious wake-up. To make this +- even less likely, __g1_start contains the index of the current g2 too, +- which allows waiters to check if there aliasing on the group slots; if +- there wasn't, they didn't steal from the current G1, which means that the +- G1 they stole from must have been already closed and they do not need to +- fix anything. +- + It is essential that the last field in pthread_cond_t is __g_signals[1]: + The previous condvar used a pointer-sized field in pthread_cond_t, so a + PTHREAD_COND_INITIALIZER from that condvar implementation might only +@@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + { + while (1) + { ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); ++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ + /* Spin-wait first. + Note that spinning first without checking whether a timeout + passed might lead to what looks like a spurious wake-up even +@@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + having to compare against the current time seems to be the right + choice from a performance perspective for most use cases. */ + unsigned int spin = maxspin; +- while (signals == 0 && spin > 0) ++ while (spin > 0 && ((int)(signals - lowseq) < 2)) + { + /* Check that we are not spinning on a group that's already + closed. */ +- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) +- goto done; ++ if (seq < (g1_start >> 1)) ++ break; + + /* TODO Back off. */ + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + spin--; + } + +- /* If our group will be closed as indicated by the flag on signals, +- don't bother grabbing a signal. */ +- if (signals & 1) +- goto done; +- +- /* If there is an available signal, don't block. */ +- if (signals != 0) ++ if (seq < (g1_start >> 1)) ++ { ++ /* If the group is closed already, ++ then this waiter originally had enough extra signals to ++ consume, up until the time its group was closed. */ ++ goto done; ++ } ++ ++ /* If there is an available signal, don't block. ++ If __g1_start has advanced at all, then we must be in G1 ++ by now, perhaps in the process of switching back to an older ++ G2, but in either case we're allowed to consume the available ++ signal and should not block anymore. */ ++ if ((int)(signals - lowseq) >= 2) + break; + + /* No signals available after spinning, so prepare to block. 
+ We first acquire a group reference and use acquire MO for that so + that we synchronize with the dummy read-modify-write in + __condvar_quiesce_and_switch_g1 if we read from that. In turn, +- in this case this will make us see the closed flag on __g_signals +- that designates a concurrent attempt to reuse the group's slot. ++ in this case this will make us see the advancement of __g_signals ++ to the upcoming new g1_start that occurs with a concurrent ++ attempt to reuse the group's slot. + We use acquire MO for the __g_signals check to make the + __g1_start check work (see spinning above). + Note that the group reference acquisition will not mask the +@@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + an atomic read-modify-write operation and thus extend the release + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); +- if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0) +- || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))) ++ signals = atomic_load_acquire (cond->__data.__g_signals + g); ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ ++ if (seq < (g1_start >> 1)) + { +- /* Our group is closed. Wake up any signalers that might be +- waiting. */ ++ /* group is closed already, so don't block */ + __condvar_dec_grefs (cond, g, private); + goto done; + } + ++ if ((int)(signals - lowseq) >= 2) ++ { ++ /* a signal showed up or G1/G2 switched after we grabbed the refcount */ ++ __condvar_dec_grefs (cond, g, private); ++ break; ++ } ++ + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; +@@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); + + err = __futex_abstimed_wait_cancelable64 ( +- cond->__data.__g_signals + g, 0, clockid, abstime, private); ++ cond->__data.__g_signals + g, signals, clockid, abstime, private); + + __pthread_cleanup_pop (&buffer, 0); + +@@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + signals = atomic_load_acquire (cond->__data.__g_signals + g); + } + ++ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) ++ goto done; + } + /* Try to grab a signal. Use acquire MO so that we see an up-to-date value + of __g1_start below (see spinning above for a similar case). In +@@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, + &signals, signals - 2)); + +- /* We consumed a signal but we could have consumed from a more recent group +- that aliased with ours due to being in the same group slot. If this +- might be the case our group must be closed as visible through +- __g1_start. */ +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- if (seq < (g1_start >> 1)) +- { +- /* We potentially stole a signal from a more recent group but we do not +- know which group we really consumed from. +- We do not care about groups older than current G1 because they are +- closed; we could have stolen from these, but then we just add a +- spurious wake-up for the current groups. +- We will never steal a signal from current G2 that was really intended +- for G2 because G2 never receives signals (until it becomes G1). 
We +- could have stolen a signal from G2 that was conservatively added by a +- previous waiter that also thought it stole a signal -- but given that +- that signal was added unnecessarily, it's not a problem if we steal +- it. +- Thus, the remaining case is that we could have stolen from the current +- G1, where "current" means the __g1_start value we observed. However, +- if the current G1 does not have the same slot index as we do, we did +- not steal from it and do not need to undo that. This is the reason +- for putting a bit with G2's index into__g1_start as well. */ +- if (((g1_start & 1) ^ 1) == g) +- { +- /* We have to conservatively undo our potential mistake of stealing +- a signal. We can stop trying to do that when the current G1 +- changes because other spinning waiters will notice this too and +- __condvar_quiesce_and_switch_g1 has checked that there are no +- futex waiters anymore before switching G1. +- Relaxed MO is fine for the __g1_start load because we need to +- merely be able to observe this fact and not have to observe +- something else as well. +- ??? Would it help to spin for a little while to see whether the +- current G1 gets closed? This might be worthwhile if the group is +- small or close to being closed. */ +- unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g); +- while (__condvar_load_g1_start_relaxed (cond) == g1_start) +- { +- /* Try to add a signal. We don't need to acquire the lock +- because at worst we can cause a spurious wake-up. If the +- group is in the process of being closed (LSB is true), this +- has an effect similar to us adding a signal. */ +- if (((s & 1) != 0) +- || atomic_compare_exchange_weak_relaxed +- (cond->__data.__g_signals + g, &s, s + 2)) +- { +- /* If we added a signal, we also need to add a wake-up on +- the futex. We also need to do that if we skipped adding +- a signal because the group is being closed because +- while __condvar_quiesce_and_switch_g1 could have closed +- the group, it might still be waiting for futex waiters to +- leave (and one of those waiters might be the one we stole +- the signal from, which cause it to block using the +- futex). */ +- futex_wake (cond->__data.__g_signals + g, 1, private); +- break; +- } +- /* TODO Back off. */ +- } +- } +- } +- + done: + + /* Confirm that we have been woken. We do that before acquiring the mutex + +commit 88d999d840e77c9917f08870094a23ce42294848 +Author: Malte Skarupke +Date: Wed Dec 4 07:55:22 2024 -0500 + + nptl: Update comments and indentation for new condvar implementation + + Some comments were wrong after the most recent commit. This fixes that. + + Also fixing indentation where it was using spaces instead of tabs. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 0cc973160c23bb67f895bc887dd6942d29f8fee3) + +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 4855b8899f..3475d15123 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -221,8 +221,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + * New waiters arriving concurrently with the group switching will all go + into G2 until we atomically make the switch. Waiters existing in G2 + are not affected. +- * Waiters in G1 will be closed out immediately by the advancing of +- __g_signals to the next "lowseq" (low 31 bits of the new g1_start), ++ * Waiters in G1 have already received a signal and been woken. 
If they ++ haven't woken yet, they will be closed out immediately by the advancing ++ of __g_signals to the next "lowseq" (low 31 bits of the new g1_start), + which will prevent waiters from blocking using a futex on + __g_signals since it provides enough signals for all possible + remaining waiters. As a result, they can each consume a signal +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 3d290e39c8..ad2cee7d59 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -249,7 +249,7 @@ __condvar_cleanup_waiting (void *arg) + figure out whether they are in a group that has already been completely + signaled (i.e., if the current G1 starts at a later position that the + waiter's position). Waiters cannot determine whether they are currently +- in G2 or G1 -- but they do not have too because all they are interested in ++ in G2 or G1 -- but they do not have to because all they are interested in + is whether there are available signals, and they always start in G2 (whose + group slot they know because of the bit in the waiter sequence. Signalers + will simply fill the right group until it is completely signaled and can +@@ -412,7 +412,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + } + + /* Now wait until a signal is available in our group or it is closed. +- Acquire MO so that if we observe a value of zero written after group ++ Acquire MO so that if we observe (signals == lowseq) after group + switching in __condvar_quiesce_and_switch_g1, we synchronize with that + store and will see the prior update of __g1_start done while switching + groups too. */ +@@ -422,8 +422,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + { + while (1) + { +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); ++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + + /* Spin-wait first. + Note that spinning first without checking whether a timeout +@@ -447,21 +447,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + spin--; + } + +- if (seq < (g1_start >> 1)) ++ if (seq < (g1_start >> 1)) + { +- /* If the group is closed already, ++ /* If the group is closed already, + then this waiter originally had enough extra signals to + consume, up until the time its group was closed. */ + goto done; +- } ++ } + + /* If there is an available signal, don't block. +- If __g1_start has advanced at all, then we must be in G1 ++ If __g1_start has advanced at all, then we must be in G1 + by now, perhaps in the process of switching back to an older + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ +@@ -483,22 +483,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); + signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? 
signals : g1_start & ~1U; ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +- if (seq < (g1_start >> 1)) ++ if (seq < (g1_start >> 1)) + { +- /* group is closed already, so don't block */ ++ /* group is closed already, so don't block */ + __condvar_dec_grefs (cond, g, private); + goto done; + } + + if ((int)(signals - lowseq) >= 2) + { +- /* a signal showed up or G1/G2 switched after we grabbed the refcount */ ++ /* a signal showed up or G1/G2 switched after we grabbed the ++ refcount */ + __condvar_dec_grefs (cond, g, private); + break; +- } ++ } + + // Now block. + struct _pthread_cleanup_buffer buffer; +@@ -536,10 +537,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) + goto done; + } +- /* Try to grab a signal. Use acquire MO so that we see an up-to-date value +- of __g1_start below (see spinning above for a similar case). In +- particular, if we steal from a more recent group, we will also see a +- more recent __g1_start below. */ ++ /* Try to grab a signal. See above for MO. (if we do another loop ++ iteration we need to see the correct value of g1_start) */ + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, + &signals, signals - 2)); + + +commit 136a29f9d0a3924828d5a16be82d054637517c95 +Author: Malte Skarupke +Date: Wed Dec 4 07:55:50 2024 -0500 + + nptl: Remove unnecessary catch-all-wake in condvar group switch + + This wake is unnecessary. We only switch groups after every sleeper in a group + has been woken. Sure, they may take a while to actually wake up and may still + hold a reference, but waking them a second time doesn't speed that up. Instead + this just makes the code more complicated and may hide problems. + + In particular this safety wake wouldn't even have helped with the bug that was + fixed by Barrus' patch: The bug there was that pthread_cond_signal would not + switch g1 when it should, so we wouldn't even have entered this code path. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit b42cc6af11062c260c7dfa91f1c89891366fed3e) + +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 3475d15123..30b8eee149 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -221,13 +221,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + * New waiters arriving concurrently with the group switching will all go + into G2 until we atomically make the switch. Waiters existing in G2 + are not affected. +- * Waiters in G1 have already received a signal and been woken. If they +- haven't woken yet, they will be closed out immediately by the advancing +- of __g_signals to the next "lowseq" (low 31 bits of the new g1_start), +- which will prevent waiters from blocking using a futex on +- __g_signals since it provides enough signals for all possible +- remaining waiters. As a result, they can each consume a signal +- and they will eventually remove their group reference. */ ++ * Waiters in G1 have already received a signal and been woken. */ + + /* Update __g1_start, which finishes closing this group. 
The value we add + will never be negative because old_orig_size can only be zero when we +@@ -240,29 +234,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + + unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; + +- /* If any waiters still hold group references (and thus could be blocked), +- then wake them all up now and prevent any running ones from blocking. +- This is effectively a catch-all for any possible current or future +- bugs that can allow the group size to reach 0 before all G1 waiters +- have been awakened or at least given signals to consume, or any +- other case that can leave blocked (or about to block) older waiters.. */ +- if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0) +- { +- /* First advance signals to the end of the group (i.e. enough signals +- for the entire G1 group) to ensure that waiters which have not +- yet blocked in the futex will not block. +- Note that in the vast majority of cases, this should never +- actually be necessary, since __g_signals will have enough +- signals for the remaining g_refs waiters. As an optimization, +- we could check this first before proceeding, although that +- could still leave the potential for futex lost wakeup bugs +- if the signal count was non-zero but the futex wakeup +- was somehow lost. */ +- atomic_store_release (cond->__data.__g_signals + g1, lowseq); +- +- futex_wake (cond->__data.__g_signals + g1, INT_MAX, private); +- } +- + /* At this point, the old G1 is now a valid new G2 (but not in use yet). + No old waiter can neither grab a signal nor acquire a reference without + noticing that __g1_start is larger. + +commit 2a259b6d77dc5bdab5c8f4ee0e69572d5699d4bf +Author: Malte Skarupke +Date: Wed Dec 4 07:56:13 2024 -0500 + + nptl: Remove unnecessary quadruple check in pthread_cond_wait + + pthread_cond_wait was checking whether it was in a closed group no less than + four times. Checking once is enough. Here are the four checks: + + 1. While spin-waiting. This was dead code: maxspin is set to 0 and has been + for years. + 2. Before deciding to go to sleep, and before incrementing grefs: I kept this + 3. After incrementing grefs. There is no reason to think that the group would + close while we do an atomic increment. Obviously it could close at any + point, but that doesn't mean we have to recheck after every step. This + check was equally good as check 2, except it has to do more work. + 4. When we find ourselves in a group that has a signal. We only get here after + we check that we're not in a closed group. There is no need to check again. + The check would only have helped in cases where the compare_exchange in the + next line would also have failed. Relying on the compare_exchange is fine. + + Removing the duplicate checks clarifies the code. 
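+
+    As a minimal, self-contained illustration of the idea behind check 4
+    (hypothetical code, not taken from glibc): a failed
+    atomic_compare_exchange_weak already reloads the current value into
+    its "expected" argument, so a separate re-check before retrying adds
+    nothing.
+
+      #include <stdatomic.h>
+      #include <stdio.h>
+
+      /* Hypothetical stand-in for a group's signal count.  */
+      static atomic_uint available_signals = 5;
+
+      /* Consume one signal if any is left.  On CAS failure, SIGNALS is
+         refreshed to the current value and the loop condition re-checks
+         it, so no extra pre-check is needed before retrying.  */
+      static int
+      try_consume_signal (void)
+      {
+        unsigned int signals
+          = atomic_load_explicit (&available_signals, memory_order_acquire);
+        while (signals > 0)
+          if (atomic_compare_exchange_weak_explicit (&available_signals,
+                                                     &signals, signals - 1,
+                                                     memory_order_acquire,
+                                                     memory_order_relaxed))
+            return 1;
+        return 0;
+      }
+
+      int
+      main (void)
+      {
+        while (try_consume_signal ())
+          puts ("consumed one signal");
+        return 0;
+      }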
+ + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index ad2cee7d59..cfdd13bb87 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -366,7 +366,6 @@ static __always_inline int + __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + clockid_t clockid, const struct __timespec64 *abstime) + { +- const int maxspin = 0; + int err; + int result = 0; + +@@ -425,33 +424,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +- /* Spin-wait first. +- Note that spinning first without checking whether a timeout +- passed might lead to what looks like a spurious wake-up even +- though we should return ETIMEDOUT (e.g., if the caller provides +- an absolute timeout that is clearly in the past). However, +- (1) spurious wake-ups are allowed, (2) it seems unlikely that a +- user will (ab)use pthread_cond_wait as a check for whether a +- point in time is in the past, and (3) spinning first without +- having to compare against the current time seems to be the right +- choice from a performance perspective for most use cases. */ +- unsigned int spin = maxspin; +- while (spin > 0 && ((int)(signals - lowseq) < 2)) +- { +- /* Check that we are not spinning on a group that's already +- closed. */ +- if (seq < (g1_start >> 1)) +- break; +- +- /* TODO Back off. */ +- +- /* Reload signals. See above for MO. */ +- signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; +- spin--; +- } +- + if (seq < (g1_start >> 1)) + { + /* If the group is closed already, +@@ -482,24 +454,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + an atomic read-modify-write operation and thus extend the release + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); +- signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; +- +- if (seq < (g1_start >> 1)) +- { +- /* group is closed already, so don't block */ +- __condvar_dec_grefs (cond, g, private); +- goto done; +- } +- +- if ((int)(signals - lowseq) >= 2) +- { +- /* a signal showed up or G1/G2 switched after we grabbed the +- refcount */ +- __condvar_dec_grefs (cond, g, private); +- break; +- } + + // Now block. + struct _pthread_cleanup_buffer buffer; +@@ -533,9 +487,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); + } +- +- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) +- goto done; + } + /* Try to grab a signal. See above for MO. (if we do another loop + iteration we need to see the correct value of g1_start) */ + +commit a2465f4293ecc37ac4650fbd02e517bc6fd801c6 +Author: Malte Skarupke +Date: Wed Dec 4 07:56:38 2024 -0500 + + nptl: Remove g_refs from condition variables + + This variable used to be needed to wait in group switching until all sleepers + have confirmed that they have woken. This is no longer needed. 
Nothing waits + on this variable so there is no need to track how many threads are currently + asleep in each group. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit c36fc50781995e6758cae2b6927839d0157f213c) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index cfdd13bb87..411fc0380b 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, + } + } + +-/* Wake up any signalers that might be waiting. */ +-static void +-__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private) +-{ +- /* Release MO to synchronize-with the acquire load in +- __condvar_quiesce_and_switch_g1. */ +- if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3) +- { +- /* Clear the wake-up request flag before waking up. We do not need more +- than relaxed MO and it doesn't matter if we apply this for an aliased +- group because we wake all futex waiters right after clearing the +- flag. */ +- atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1); +- futex_wake (cond->__data.__g_refs + g, INT_MAX, private); +- } +-} +- + /* Clean-up for cancellation of waiters waiting for normal signals. We cancel + our registration as a waiter, confirm we have woken up, and re-acquire the + mutex. */ +@@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg) + pthread_cond_t *cond = cbuffer->cond; + unsigned g = cbuffer->wseq & 1; + +- __condvar_dec_grefs (cond, g, cbuffer->private); +- + __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private); + /* FIXME With the current cancellation implementation, it is possible that + a thread is cancelled after it has returned from a syscall. This could +@@ -327,15 +308,6 @@ __condvar_cleanup_waiting (void *arg) + sufficient because if a waiter can see a sufficiently large value, it could + have also consume a signal in the waiters group. + +- It is essential that the last field in pthread_cond_t is __g_signals[1]: +- The previous condvar used a pointer-sized field in pthread_cond_t, so a +- PTHREAD_COND_INITIALIZER from that condvar implementation might only +- initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes +- in total instead of the 48 we need). __g_signals[1] is not accessed before +- the first group switch (G2 starts at index 0), which will set its value to +- zero after a harmless fetch-or whose return value is ignored. This +- effectively completes initialization. +- + + Limitations: + * This condvar isn't designed to allow for more than +@@ -440,21 +412,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + if ((int)(signals - lowseq) >= 2) + break; + +- /* No signals available after spinning, so prepare to block. +- We first acquire a group reference and use acquire MO for that so +- that we synchronize with the dummy read-modify-write in +- __condvar_quiesce_and_switch_g1 if we read from that. In turn, +- in this case this will make us see the advancement of __g_signals +- to the upcoming new g1_start that occurs with a concurrent +- attempt to reuse the group's slot. +- We use acquire MO for the __g_signals check to make the +- __g1_start check work (see spinning above). +- Note that the group reference acquisition will not mask the +- release MO when decrementing the reference count because we use +- an atomic read-modify-write operation and thus extend the release +- sequence. 
*/ +- atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); +- + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; +@@ -471,18 +428,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + + if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) + { +- __condvar_dec_grefs (cond, g, private); +- /* If we timed out, we effectively cancel waiting. Note that +- we have decremented __g_refs before cancellation, so that a +- deadlock between waiting for quiescence of our group in +- __condvar_quiesce_and_switch_g1 and us trying to acquire +- the lock during cancellation is not possible. */ ++ /* If we timed out, we effectively cancel waiting. */ + __condvar_cancel_waiting (cond, seq, g, private); + result = err; + goto done; + } +- else +- __condvar_dec_grefs (cond, g, private); + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); +diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c +index 1336e9c79d..bdcb45c536 100644 +--- a/nptl/tst-cond22.c ++++ b/nptl/tst-cond22.c +@@ -106,13 +106,13 @@ do_test (void) + status = 1; + } + +- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", ++ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", + c.__data.__wseq.__value32.__high, + c.__data.__wseq.__value32.__low, + c.__data.__g1_start.__value32.__high, + c.__data.__g1_start.__value32.__low, +- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], +- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], ++ c.__data.__g_signals[0], c.__data.__g_size[0], ++ c.__data.__g_signals[1], c.__data.__g_size[1], + c.__data.__g1_orig_size, c.__data.__wrefs); + + if (pthread_create (&th, NULL, tf, (void *) 1l) != 0) +@@ -152,13 +152,13 @@ do_test (void) + status = 1; + } + +- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", ++ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", + c.__data.__wseq.__value32.__high, + c.__data.__wseq.__value32.__low, + c.__data.__g1_start.__value32.__high, + c.__data.__g1_start.__value32.__low, +- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], +- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], ++ c.__data.__g_signals[0], c.__data.__g_size[0], ++ c.__data.__g_signals[1], c.__data.__g_size[1], + c.__data.__g1_orig_size, c.__data.__wrefs); + + return status; +diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h +index df54eef6f7..a3d482f80f 100644 +--- a/sysdeps/nptl/bits/thread-shared-types.h ++++ b/sysdeps/nptl/bits/thread-shared-types.h +@@ -95,8 +95,7 @@ struct __pthread_cond_s + { + __atomic_wide_counter __wseq; + __atomic_wide_counter __g1_start; +- unsigned int __g_refs[2] __LOCK_ALIGNMENT; +- unsigned int __g_size[2]; ++ unsigned int __g_size[2] __LOCK_ALIGNMENT; + unsigned int __g1_orig_size; + unsigned int __wrefs; + unsigned int __g_signals[2]; +diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h +index 3d4f4a756c..9af75d6eae 100644 +--- a/sysdeps/nptl/pthread.h ++++ b/sysdeps/nptl/pthread.h +@@ -152,7 +152,7 @@ enum + + + /* Conditional variable handling. 
*/ +-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } } ++#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } + + + /* Cleanup buffers */ + +commit fa110993a6390ae5c97dff613ef02b59ec78c5da +Author: Malte Skarupke +Date: Wed Dec 4 08:03:44 2024 -0500 + + nptl: Use a single loop in pthread_cond_wait instaed of a nested loop + + The loop was a little more complicated than necessary. There was only one + break statement out of the inner loop, and the outer loop was nearly empty. + So just remove the outer loop, moving its code to the one break statement in + the inner loop. This allows us to replace all gotos with break statements. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 929a4764ac90382616b6a21f099192b2475da674) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 411fc0380b..683cb2b133 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -382,17 +382,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + return err; + } + +- /* Now wait until a signal is available in our group or it is closed. +- Acquire MO so that if we observe (signals == lowseq) after group +- switching in __condvar_quiesce_and_switch_g1, we synchronize with that +- store and will see the prior update of __g1_start done while switching +- groups too. */ +- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); +- +- do +- { ++ + while (1) + { ++ /* Now wait until a signal is available in our group or it is closed. ++ Acquire MO so that if we observe (signals == lowseq) after group ++ switching in __condvar_quiesce_and_switch_g1, we synchronize with that ++ store and will see the prior update of __g1_start done while switching ++ groups too. */ ++ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +@@ -401,7 +399,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + /* If the group is closed already, + then this waiter originally had enough extra signals to + consume, up until the time its group was closed. */ +- goto done; ++ break; + } + + /* If there is an available signal, don't block. +@@ -410,7 +408,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ + if ((int)(signals - lowseq) >= 2) +- break; ++ { ++ /* Try to grab a signal. See above for MO. (if we do another loop ++ iteration we need to see the correct value of g1_start) */ ++ if (atomic_compare_exchange_weak_acquire ( ++ cond->__data.__g_signals + g, ++ &signals, signals - 2)) ++ break; ++ else ++ continue; ++ } + + // Now block. + struct _pthread_cleanup_buffer buffer; +@@ -431,19 +438,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + /* If we timed out, we effectively cancel waiting. */ + __condvar_cancel_waiting (cond, seq, g, private); + result = err; +- goto done; ++ break; + } +- +- /* Reload signals. See above for MO. */ +- signals = atomic_load_acquire (cond->__data.__g_signals + g); + } +- } +- /* Try to grab a signal. See above for MO. 
(if we do another loop +- iteration we need to see the correct value of g1_start) */ +- while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, +- &signals, signals - 2)); +- +- done: + + /* Confirm that we have been woken. We do that before acquiring the mutex + to allow for execution of pthread_cond_destroy while having acquired the + +commit afbf0d46850dcd1b626d892ad8fde2162067ddc7 +Author: Malte Skarupke +Date: Wed Dec 4 08:04:10 2024 -0500 + + nptl: Fix indentation + + In my previous change I turned a nested loop into a simple loop. I'm doing + the resulting indentation changes in a separate commit to make the diff on + the previous commit easier to review. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit ee6c14ed59d480720721aaacc5fb03213dc153da) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 683cb2b133..7fc9dadf15 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -383,65 +383,65 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + } + + +- while (1) +- { +- /* Now wait until a signal is available in our group or it is closed. +- Acquire MO so that if we observe (signals == lowseq) after group +- switching in __condvar_quiesce_and_switch_g1, we synchronize with that +- store and will see the prior update of __g1_start done while switching +- groups too. */ +- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; +- +- if (seq < (g1_start >> 1)) +- { +- /* If the group is closed already, +- then this waiter originally had enough extra signals to +- consume, up until the time its group was closed. */ +- break; +- } +- +- /* If there is an available signal, don't block. +- If __g1_start has advanced at all, then we must be in G1 +- by now, perhaps in the process of switching back to an older +- G2, but in either case we're allowed to consume the available +- signal and should not block anymore. */ +- if ((int)(signals - lowseq) >= 2) +- { +- /* Try to grab a signal. See above for MO. (if we do another loop +- iteration we need to see the correct value of g1_start) */ +- if (atomic_compare_exchange_weak_acquire ( +- cond->__data.__g_signals + g, ++ while (1) ++ { ++ /* Now wait until a signal is available in our group or it is closed. ++ Acquire MO so that if we observe (signals == lowseq) after group ++ switching in __condvar_quiesce_and_switch_g1, we synchronize with that ++ store and will see the prior update of __g1_start done while switching ++ groups too. */ ++ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); ++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ ++ if (seq < (g1_start >> 1)) ++ { ++ /* If the group is closed already, ++ then this waiter originally had enough extra signals to ++ consume, up until the time its group was closed. */ ++ break; ++ } ++ ++ /* If there is an available signal, don't block. ++ If __g1_start has advanced at all, then we must be in G1 ++ by now, perhaps in the process of switching back to an older ++ G2, but in either case we're allowed to consume the available ++ signal and should not block anymore. */ ++ if ((int)(signals - lowseq) >= 2) ++ { ++ /* Try to grab a signal. See above for MO. 
(if we do another loop ++ iteration we need to see the correct value of g1_start) */ ++ if (atomic_compare_exchange_weak_acquire ( ++ cond->__data.__g_signals + g, + &signals, signals - 2)) +- break; +- else +- continue; +- } +- +- // Now block. +- struct _pthread_cleanup_buffer buffer; +- struct _condvar_cleanup_buffer cbuffer; +- cbuffer.wseq = wseq; +- cbuffer.cond = cond; +- cbuffer.mutex = mutex; +- cbuffer.private = private; +- __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); +- +- err = __futex_abstimed_wait_cancelable64 ( +- cond->__data.__g_signals + g, signals, clockid, abstime, private); +- +- __pthread_cleanup_pop (&buffer, 0); +- +- if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) +- { +- /* If we timed out, we effectively cancel waiting. */ +- __condvar_cancel_waiting (cond, seq, g, private); +- result = err; + break; +- } ++ else ++ continue; + } + ++ // Now block. ++ struct _pthread_cleanup_buffer buffer; ++ struct _condvar_cleanup_buffer cbuffer; ++ cbuffer.wseq = wseq; ++ cbuffer.cond = cond; ++ cbuffer.mutex = mutex; ++ cbuffer.private = private; ++ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); ++ ++ err = __futex_abstimed_wait_cancelable64 ( ++ cond->__data.__g_signals + g, signals, clockid, abstime, private); ++ ++ __pthread_cleanup_pop (&buffer, 0); ++ ++ if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) ++ { ++ /* If we timed out, we effectively cancel waiting. */ ++ __condvar_cancel_waiting (cond, seq, g, private); ++ result = err; ++ break; ++ } ++ } ++ + /* Confirm that we have been woken. We do that before acquiring the mutex + to allow for execution of pthread_cond_destroy while having acquired the + mutex. */ + +commit 2ad69497346cc20ef4d568108f1de49b2f451c55 +Author: Malte Skarupke +Date: Wed Dec 4 08:04:54 2024 -0500 + + nptl: rename __condvar_quiesce_and_switch_g1 + + This function no longer waits for threads to leave g1, so rename it to + __condvar_switch_g1 + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867) + +diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c +index aada91639a..38bba17bfc 100644 +--- a/nptl/pthread_cond_broadcast.c ++++ b/nptl/pthread_cond_broadcast.c +@@ -60,7 +60,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + cond->__data.__g_size[g1] << 1); + cond->__data.__g_size[g1] = 0; + +- /* We need to wake G1 waiters before we quiesce G1 below. */ ++ /* We need to wake G1 waiters before we switch G1 below. */ + /* TODO Only set it if there are indeed futex waiters. We could + also try to move this out of the critical section in cases when + G2 is empty (and we don't need to quiesce). */ +@@ -69,7 +69,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + + /* G1 is complete. Step (2) is next unless there are no waiters in G2, in + which case we can stop. */ +- if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) ++ if (__condvar_switch_g1 (cond, wseq, &g1, private)) + { + /* Step (3): Send signals to all waiters in the old G2 / new G1. 
*/ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 30b8eee149..5044273cc2 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -189,16 +189,15 @@ __condvar_get_private (int flags) + return FUTEX_SHARED; + } + +-/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to +- leave G1, converts G1 into a fresh G2, and then switches group roles so that +- the former G2 becomes the new G1 ending at the current __wseq value when we +- eventually make the switch (WSEQ is just an observation of __wseq by the +- signaler). ++/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2, ++ and then switches group roles so that the former G2 becomes the new G1 ++ ending at the current __wseq value when we eventually make the switch ++ (WSEQ is just an observation of __wseq by the signaler). + If G2 is empty, it will not switch groups because then it would create an + empty G1 which would require switching groups again on the next signal. + Returns false iff groups were not switched because G2 was empty. */ + static bool __attribute__ ((unused)) +-__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, ++__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + unsigned int *g1index, int private) + { + unsigned int g1 = *g1index; +@@ -214,8 +213,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + + cond->__data.__g_size[g1 ^ 1]) == 0) + return false; + +- /* Now try to close and quiesce G1. We have to consider the following kinds +- of waiters: ++ /* We have to consider the following kinds of waiters: + * Waiters from less recent groups than G1 are not affected because + nothing will change for them apart from __g1_start getting larger. + * New waiters arriving concurrently with the group switching will all go +@@ -223,12 +221,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + are not affected. + * Waiters in G1 have already received a signal and been woken. */ + +- /* Update __g1_start, which finishes closing this group. The value we add +- will never be negative because old_orig_size can only be zero when we +- switch groups the first time after a condvar was initialized, in which +- case G1 will be at index 1 and we will add a value of 1. +- Relaxed MO is fine because the change comes with no additional +- constraints that others would have to observe. */ ++ /* Update __g1_start, which closes this group. The value we add will never ++ be negative because old_orig_size can only be zero when we switch groups ++ the first time after a condvar was initialized, in which case G1 will be ++ at index 1 and we will add a value of 1. Relaxed MO is fine because the ++ change comes with no additional constraints that others would have to ++ observe. */ + __condvar_add_g1_start_relaxed (cond, + (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); + +diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c +index 43d6286ecd..f095497142 100644 +--- a/nptl/pthread_cond_signal.c ++++ b/nptl/pthread_cond_signal.c +@@ -69,18 +69,17 @@ ___pthread_cond_signal (pthread_cond_t *cond) + bool do_futex_wake = false; + + /* If G1 is still receiving signals, we put the signal there. 
If not, we +- check if G2 has waiters, and if so, quiesce and switch G1 to the former +- G2; if this results in a new G1 with waiters (G2 might have cancellations +- already, see __condvar_quiesce_and_switch_g1), we put the signal in the +- new G1. */ ++ check if G2 has waiters, and if so, switch G1 to the former G2; if this ++ results in a new G1 with waiters (G2 might have cancellations already, ++ see __condvar_switch_g1), we put the signal in the new G1. */ + if ((cond->__data.__g_size[g1] != 0) +- || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) ++ || __condvar_switch_g1 (cond, wseq, &g1, private)) + { + /* Add a signal. Relaxed MO is fine because signaling does not need to +- establish a happens-before relation (see above). We do not mask the +- release-MO store when initializing a group in +- __condvar_quiesce_and_switch_g1 because we use an atomic +- read-modify-write and thus extend that store's release sequence. */ ++ establish a happens-before relation (see above). We do not mask the ++ release-MO store when initializing a group in __condvar_switch_g1 ++ because we use an atomic read-modify-write and thus extend that ++ store's release sequence. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); + cond->__data.__g_size[g1]--; + /* TODO Only set it if there are indeed futex waiters. */ +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 7fc9dadf15..80bb728211 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -354,8 +354,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + because we do not need to establish any happens-before relation with + signalers (see __pthread_cond_signal); modification order alone + establishes a total order of waiters/signals. We do need acquire MO +- to synchronize with group reinitialization in +- __condvar_quiesce_and_switch_g1. */ ++ to synchronize with group reinitialization in __condvar_switch_g1. */ + uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2); + /* Find our group's index. We always go into what was G2 when we acquired + our position. */ +@@ -387,9 +386,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + { + /* Now wait until a signal is available in our group or it is closed. + Acquire MO so that if we observe (signals == lowseq) after group +- switching in __condvar_quiesce_and_switch_g1, we synchronize with that +- store and will see the prior update of __g1_start done while switching +- groups too. */ ++ switching in __condvar_switch_g1, we synchronize with that store and ++ will see the prior update of __g1_start done while switching groups ++ too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +commit 7f71824b8039b8afc150dd5c881b61faf10675ef +Author: Malte Skarupke +Date: Wed Dec 4 08:05:40 2024 -0500 + + nptl: Use all of g1_start and g_signals + + The LSB of g_signals was unused. The LSB of g1_start was used to indicate + which group is G2. This was used to always go to sleep in pthread_cond_wait + if a waiter is in G2. 
A comment earlier in the file says that this is not + correct to do: + + "Waiters cannot determine whether they are currently in G2 or G1 -- but they + do not have to because all they are interested in is whether there are + available signals" + + I either would have had to update the comment, or get rid of the check. I + chose to get rid of the check. In fact I don't quite know why it was there. + There will never be available signals for group G2, so we didn't need the + special case. Even if there were, this would just be a spurious wake. This + might have caught some cases where the count has wrapped around, but it + wouldn't reliably do that, (and even if it did, why would you want to force a + sleep in that case?) and we don't support that many concurrent waiters + anyway. Getting rid of it allows us to use one more bit, making us more + robust to wraparound. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 91bb902f58264a2fd50fbce8f39a9a290dd23706) + +diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c +index 38bba17bfc..51afa62adf 100644 +--- a/nptl/pthread_cond_broadcast.c ++++ b/nptl/pthread_cond_broadcast.c +@@ -57,7 +57,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + { + /* Add as many signals as the remaining size of the group. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, +- cond->__data.__g_size[g1] << 1); ++ cond->__data.__g_size[g1]); + cond->__data.__g_size[g1] = 0; + + /* We need to wake G1 waiters before we switch G1 below. */ +@@ -73,7 +73,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + { + /* Step (3): Send signals to all waiters in the old G2 / new G1. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, +- cond->__data.__g_size[g1] << 1); ++ cond->__data.__g_size[g1]); + cond->__data.__g_size[g1] = 0; + /* TODO Only set it if there are indeed futex waiters. */ + do_futex_wake = true; +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 5044273cc2..389402913c 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -208,9 +208,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + behavior. + Note that this works correctly for a zero-initialized condvar too. */ + unsigned int old_orig_size = __condvar_get_orig_size (cond); +- uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; +- if (((unsigned) (wseq - old_g1_start - old_orig_size) +- + cond->__data.__g_size[g1 ^ 1]) == 0) ++ uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond); ++ uint64_t new_g1_start = old_g1_start + old_orig_size; ++ if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0) + return false; + + /* We have to consider the following kinds of waiters: +@@ -221,16 +221,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + are not affected. + * Waiters in G1 have already received a signal and been woken. */ + +- /* Update __g1_start, which closes this group. The value we add will never +- be negative because old_orig_size can only be zero when we switch groups +- the first time after a condvar was initialized, in which case G1 will be +- at index 1 and we will add a value of 1. Relaxed MO is fine because the +- change comes with no additional constraints that others would have to +- observe. */ +- __condvar_add_g1_start_relaxed (cond, +- (old_orig_size << 1) + (g1 == 1 ? 
1 : - 1)); +- +- unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; ++ /* Update __g1_start, which closes this group. Relaxed MO is fine because ++ the change comes with no additional constraints that others would have ++ to observe. */ ++ __condvar_add_g1_start_relaxed (cond, old_orig_size); + + /* At this point, the old G1 is now a valid new G2 (but not in use yet). + No old waiter can neither grab a signal nor acquire a reference without +@@ -242,13 +236,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + g1 ^= 1; + *g1index ^= 1; + +- /* Now advance the new G1 g_signals to the new lowseq, giving it ++ /* Now advance the new G1 g_signals to the new g1_start, giving it + an effective signal count of 0 to start. */ +- atomic_store_release (cond->__data.__g_signals + g1, lowseq); ++ atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start); + + /* These values are just observed by signalers, and thus protected by the + lock. */ +- unsigned int orig_size = wseq - (old_g1_start + old_orig_size); ++ unsigned int orig_size = wseq - new_g1_start; + __condvar_set_orig_size (cond, orig_size); + /* Use and addition to not loose track of cancellations in what was + previously G2. */ +diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c +index f095497142..fa3a5c3d8f 100644 +--- a/nptl/pthread_cond_signal.c ++++ b/nptl/pthread_cond_signal.c +@@ -80,7 +80,7 @@ ___pthread_cond_signal (pthread_cond_t *cond) + release-MO store when initializing a group in __condvar_switch_g1 + because we use an atomic read-modify-write and thus extend that + store's release sequence. */ +- atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); ++ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1); + cond->__data.__g_size[g1]--; + /* TODO Only set it if there are indeed futex waiters. */ + do_futex_wake = true; +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 80bb728211..0f1dfcb595 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, + not hold a reference on the group. */ + __condvar_acquire_lock (cond, private); + +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + if (g1_start > seq) + { + /* Our group is closed, so someone provided enough signals for it. +@@ -259,7 +259,6 @@ __condvar_cleanup_waiting (void *arg) + * Waiters fetch-add while having acquire the mutex associated with the + condvar. Signalers load it and fetch-xor it concurrently. + __g1_start: Starting position of G1 (inclusive) +- * LSB is index of current G2. + * Modified by signalers while having acquired the condvar-internal lock + and observed concurrently by waiters. + __g1_orig_size: Initial size of G1 +@@ -280,11 +279,9 @@ __condvar_cleanup_waiting (void *arg) + * Reference count used by waiters concurrently with signalers that have + acquired the condvar-internal lock. + __g_signals: The number of signals that can still be consumed, relative to +- the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits +- 31 to 1 of g1_start with the signal count added) ++ the current g1_start. (i.e. g1_start with the signal count added) + * Used as a futex word by waiters. Used concurrently by waiters and + signalers. +- * LSB is currently reserved and 0. + __g_size: Waiters remaining in this group (i.e., which have not been + signaled yet. 
+ * Accessed by signalers and waiters that cancel waiting (both do so only +@@ -391,9 +388,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +- if (seq < (g1_start >> 1)) ++ if (seq < g1_start) + { + /* If the group is closed already, + then this waiter originally had enough extra signals to +@@ -406,13 +402,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + by now, perhaps in the process of switching back to an older + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ +- if ((int)(signals - lowseq) >= 2) ++ if ((int)(signals - (unsigned int)g1_start) > 0) + { + /* Try to grab a signal. See above for MO. (if we do another loop + iteration we need to see the correct value of g1_start) */ + if (atomic_compare_exchange_weak_acquire ( + cond->__data.__g_signals + g, +- &signals, signals - 2)) ++ &signals, signals - 1)) + break; + else + continue; + +commit 8d3dd23e3de8b4c6e4b94f8bbfab971c3b8a55be +Author: Florian Weimer +Date: Thu Mar 13 06:07:07 2025 +0100 + + nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions (bug 32786) + + The new initializer and struct layout does not initialize the + __g_signals field in the old struct layout before the change in + commit c36fc50781995e6758cae2b6927839d0157f213c ("nptl: Remove + g_refs from condition variables"). Bring back fields at the end + of struct __pthread_cond_s, so that they are again zero-initialized. + + Reviewed-by: Sam James + +diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h +index a3d482f80f..bccc2003ec 100644 +--- a/sysdeps/nptl/bits/thread-shared-types.h ++++ b/sysdeps/nptl/bits/thread-shared-types.h +@@ -99,6 +99,8 @@ struct __pthread_cond_s + unsigned int __g1_orig_size; + unsigned int __wrefs; + unsigned int __g_signals[2]; ++ unsigned int __unused_initialized_1; ++ unsigned int __unused_initialized_2; + }; + + typedef unsigned int __tss_t; +diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h +index 9af75d6eae..e0f24418fe 100644 +--- a/sysdeps/nptl/pthread.h ++++ b/sysdeps/nptl/pthread.h +@@ -152,7 +152,7 @@ enum + + + /* Conditional variable handling. */ +-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } ++#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } } + + + /* Cleanup buffers */ + +commit 33b33e9dd0ff26158b1b83cc4347a39c073e490e +Author: Arjun Shankar +Date: Fri Oct 18 16:03:25 2024 +0200 + + libio: Fix a deadlock after fork in popen + + popen modifies its file handler book-keeping under a lock that wasn't + being taken during fork. This meant that a concurrent popen and fork + could end up copying the lock in a "locked" state into the fork child, + where subsequently calling popen would lead to a deadlock due to the + already (spuriously) held lock. + + This commit fixes the deadlock by appropriately taking the lock before + fork, and releasing/resetting it in the parent/child after the fork. + + A new test for concurrent popen and fork is also added. It consistently + hangs (and therefore fails via timeout) without the fix applied. 
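+
+    The same take-the-lock-across-fork discipline can be sketched in
+    application code with pthread_atfork (hypothetical, self-contained
+    example; the actual fix below hooks __libc_fork directly and
+    re-initializes the low-level lock in the child via
+    _IO_proc_file_chain_resetlock instead of unlocking it):
+
+      #include <pthread.h>
+      #include <stdlib.h>
+      #include <sys/wait.h>
+      #include <unistd.h>
+
+      /* Hypothetical lock standing in for popen's book-keeping lock.  */
+      static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;
+
+      /* Acquire before fork so the child cannot inherit the lock in a
+         state where another thread holds it.  */
+      static void before_fork (void) { pthread_mutex_lock (&chain_lock); }
+
+      /* Release in both processes; the forking thread is the owner.  */
+      static void in_parent (void) { pthread_mutex_unlock (&chain_lock); }
+      static void in_child (void) { pthread_mutex_unlock (&chain_lock); }
+
+      int
+      main (void)
+      {
+        if (pthread_atfork (before_fork, in_parent, in_child) != 0)
+          return EXIT_FAILURE;
+
+        pid_t pid = fork ();
+        if (pid == 0)
+          {
+            /* The lock is usable here; no deadlock on inherited state.  */
+            pthread_mutex_lock (&chain_lock);
+            pthread_mutex_unlock (&chain_lock);
+            _exit (EXIT_SUCCESS);
+          }
+        if (pid > 0)
+          waitpid (pid, NULL, 0);
+        return pid > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+      }
+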
+ Reviewed-by: Florian Weimer + + (cherry picked from commit 9f0d2c0ee6c728643fcf9a4879e9f20f5e45ce5f) + +diff --git a/libio/Makefile b/libio/Makefile +index 5292baa4e0..7faba230ac 100644 +--- a/libio/Makefile ++++ b/libio/Makefile +@@ -117,6 +117,7 @@ tests = \ + tst-mmap-offend \ + tst-mmap-setvbuf \ + tst-mmap2-eofsync \ ++ tst-popen-fork \ + tst-popen1 \ + tst-setvbuf1 \ + tst-sprintf-chk-ub \ +diff --git a/libio/iopopen.c b/libio/iopopen.c +index d01cb0648e..352513a291 100644 +--- a/libio/iopopen.c ++++ b/libio/iopopen.c +@@ -57,6 +57,26 @@ unlock (void *not_used) + } + #endif + ++/* These lock/unlock/resetlock functions are used during fork. */ ++ ++void ++_IO_proc_file_chain_lock (void) ++{ ++ _IO_lock_lock (proc_file_chain_lock); ++} ++ ++void ++_IO_proc_file_chain_unlock (void) ++{ ++ _IO_lock_unlock (proc_file_chain_lock); ++} ++ ++void ++_IO_proc_file_chain_resetlock (void) ++{ ++ _IO_lock_init (proc_file_chain_lock); ++} ++ + /* POSIX states popen shall ensure that any streams from previous popen() + calls that remain open in the parent process should be closed in the new + child process. +diff --git a/libio/libioP.h b/libio/libioP.h +index 616253fcd0..a83a411fdf 100644 +--- a/libio/libioP.h ++++ b/libio/libioP.h +@@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock) + extern void _IO_enable_locks (void) __THROW; + libc_hidden_proto (_IO_enable_locks) + ++/* Functions for operating popen's proc_file_chain_lock during fork. */ ++ ++extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden; ++extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden; ++extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden; ++ + /* Default jumptable functions. */ + + extern int _IO_default_underflow (FILE *) __THROW; +diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c +new file mode 100644 +index 0000000000..1df30fc6c0 +--- /dev/null ++++ b/libio/tst-popen-fork.c +@@ -0,0 +1,80 @@ ++/* Test concurrent popen and fork. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++static void ++popen_and_pclose (void) ++{ ++ FILE *f = popen ("true", "r"); ++ TEST_VERIFY_EXIT (f != NULL); ++ pclose (f); ++ return; ++} ++ ++static atomic_bool done = ATOMIC_VAR_INIT (0); ++ ++static void * ++popen_and_pclose_forever (__attribute__ ((unused)) ++ void *arg) ++{ ++ while (!atomic_load_explicit (&done, memory_order_acquire)) ++ popen_and_pclose (); ++ return NULL; ++} ++ ++static int ++do_test (void) ++{ ++ ++ /* Repeatedly call popen in a loop during the entire test. */ ++ pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL); ++ ++ /* Repeatedly fork off and reap child processes one-by-one. 
++ Each child calls popen once, then exits, leading to the possibility ++ that a child forks *during* our own popen call, thus inheriting any ++ intermediate popen state, possibly including lock state(s). */ ++ for (int i = 0; i < 100; i++) ++ { ++ int cpid = xfork (); ++ ++ if (cpid == 0) ++ { ++ popen_and_pclose (); ++ _exit (0); ++ } ++ else ++ xwaitpid (cpid, NULL, 0); ++ } ++ ++ /* Stop calling popen. */ ++ atomic_store_explicit (&done, 1, memory_order_release); ++ xpthread_join (t); ++ ++ return 0; ++} ++ ++#include +diff --git a/posix/fork.c b/posix/fork.c +index 298765a1ff..cf9b80e7c0 100644 +--- a/posix/fork.c ++++ b/posix/fork.c +@@ -62,6 +62,7 @@ __libc_fork (void) + call_function_static_weak (__nss_database_fork_prepare_parent, + &nss_database_data); + ++ _IO_proc_file_chain_lock (); + _IO_list_lock (); + + /* Acquire malloc locks. This needs to come last because fork +@@ -92,6 +93,7 @@ __libc_fork (void) + + /* Reset locks in the I/O code. */ + _IO_list_resetlock (); ++ _IO_proc_file_chain_resetlock (); + + call_function_static_weak (__nss_database_fork_subprocess, + &nss_database_data); +@@ -121,6 +123,7 @@ __libc_fork (void) + + /* We execute this even if the 'fork' call failed. */ + _IO_list_unlock (); ++ _IO_proc_file_chain_unlock (); + } + + /* Run the handlers registered for the parent. */ + +commit 7c3c9ae28685a9142a8cfa3521bbca74c1007d0b +Author: Arjun Shankar +Date: Fri Oct 25 09:33:45 2024 +0200 + + libio: Correctly link tst-popen-fork against libpthread + + tst-popen-fork failed to build for Hurd due to not being linked with + libpthread. This commit fixes that. + + Tested with build-many-glibcs.py for i686-gnu. + + Reviewed-by: Florian Weimer + (cherry picked from commit 6a290b2895b77be839fcb7c44a6a9879560097ad) + +diff --git a/libio/Makefile b/libio/Makefile +index 7faba230ac..f2e98f96eb 100644 +--- a/libio/Makefile ++++ b/libio/Makefile +@@ -142,6 +142,8 @@ tests = \ + tst_wscanf \ + # tests + ++$(objpfx)tst-popen-fork: $(shared-thread-library) ++ + tests-internal = tst-vtables tst-vtables-interposed + + ifeq (yes,$(build-shared)) + +commit 8667345b83c8ca528a093d4db53f57a1bb1688e4 +Author: Florian Weimer +Date: Thu Feb 13 21:56:52 2025 +0100 + + elf: Keep using minimal malloc after early DTV resize (bug 32412) + + If an auditor loads many TLS-using modules during startup, it is + possible to trigger DTV resizing. Previously, the DTV was marked + as allocated by the main malloc afterwards, even if the minimal + malloc was still in use. With this change, _dl_resize_dtv marks + the resized DTV as allocated with the minimal malloc. + + The new test reuses TLS-using modules from other auditing tests. 
+ + Reviewed-by: DJ Delorie + (cherry picked from commit aa3d7bd5299b33bffc118aa618b59bfa66059bcb) + +diff --git a/elf/Makefile b/elf/Makefile +index dc686c3bff..be64c59887 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -378,6 +378,7 @@ tests += \ + tst-align3 \ + tst-audit-tlsdesc \ + tst-audit-tlsdesc-dlopen \ ++ tst-audit-tlsdesc-dlopen2 \ + tst-audit1 \ + tst-audit2 \ + tst-audit8 \ +@@ -817,6 +818,7 @@ modules-names += \ + tst-auditmanymod8 \ + tst-auditmanymod9 \ + tst-auditmod-tlsdesc \ ++ tst-auditmod-tlsdesc2 \ + tst-auditmod1 \ + tst-auditmod11 \ + tst-auditmod12 \ +@@ -3040,6 +3042,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so + tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so + $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so + tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so ++$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \ ++ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules)) ++tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so + + $(objpfx)tst-dlmopen-twice.out: \ + $(objpfx)tst-dlmopen-twice-mod1.so \ +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index 3d529b722c..b13e752358 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) + if (newp == NULL) + oom (); + memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t)); ++#ifdef SHARED ++ /* Auditors can trigger a DTV resize event while the full malloc ++ is not yet in use. Mark the new DTV allocation as the ++ initial allocation. */ ++ if (!__rtld_malloc_is_complete ()) ++ GL(dl_initial_dtv) = &newp[1]; ++#endif + } + else + { +diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c +new file mode 100644 +index 0000000000..7ba2c4129a +--- /dev/null ++++ b/elf/tst-audit-tlsdesc-dlopen2.c +@@ -0,0 +1,46 @@ ++/* Loading TLS-using modules from auditors (bug 32412). Main program. ++ Copyright (C) 2021-2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ puts ("info: start of main program"); ++ ++ /* Load TLS-using modules, to trigger DTV resizing. The dynamic ++ linker will load them again (requiring their own TLS) because the ++ dlopen calls from the auditor were in the auditing namespace. */ ++ for (int i = 1; i <= 19; ++i) ++ { ++ char dso[30]; ++ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); ++ char sym[30]; ++ snprintf (sym, sizeof(sym), "tlsmod17a%d", i); ++ ++ void *handle = xdlopen (dso, RTLD_LAZY); ++ int (*func) (void) = xdlsym (handle, sym); ++ /* Trigger TLS allocation. 
*/ ++ func (); ++ } ++ ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c +new file mode 100644 +index 0000000000..50275cd34d +--- /dev/null ++++ b/elf/tst-auditmod-tlsdesc2.c +@@ -0,0 +1,59 @@ ++/* Loading TLS-using modules from auditors (bug 32412). Audit module. ++ Copyright (C) 2021-2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++unsigned int ++la_version (unsigned int version) ++{ ++ /* Open some modules, to trigger DTV resizing before the switch to ++ the main malloc. */ ++ for (int i = 1; i <= 19; ++i) ++ { ++ char dso[30]; ++ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); ++ char sym[30]; ++ snprintf (sym, sizeof(sym), "tlsmod17a%d", i); ++ ++ void *handle = dlopen (dso, RTLD_LAZY); ++ if (handle == NULL) ++ { ++ printf ("error: dlmopen from auditor: %s\n", dlerror ()); ++ fflush (stdout); ++ _exit (1); ++ } ++ int (*func) (void) = dlsym (handle, sym); ++ if (func == NULL) ++ { ++ printf ("error: dlsym from auditor: %s\n", dlerror ()); ++ fflush (stdout); ++ _exit (1); ++ } ++ /* Trigger TLS allocation. */ ++ func (); ++ } ++ ++ puts ("info: TLS-using modules loaded from auditor"); ++ fflush (stdout); ++ ++ return LAV_CURRENT; ++} + +commit b3002f303cedb8262cbc1ec22999ea36482efa0e +Author: Florian Weimer +Date: Tue May 20 19:36:02 2025 +0200 + + support: Use const char * argument in support_capture_subprogram_self_sgid + + The function does not modify the passed-in string, so make this clear + via the prototype. + + Reviewed-by: Carlos O'Donell + (cherry picked from commit f0c09fe61678df6f7f18fe1ebff074e62fa5ca7a) + +diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h +index 93b7245d2a..5406d9f6c0 100644 +--- a/support/capture_subprocess.h ++++ b/support/capture_subprocess.h +@@ -45,8 +45,7 @@ struct support_capture_subprocess support_capture_subprogram + /* Copy the running program into a setgid binary and run it with CHILD_ID + argument. If execution is successful, return the exit status of the child + program, otherwise return a non-zero failure exit code. */ +-int support_capture_subprogram_self_sgid +- (char *child_id); ++int support_capture_subprogram_self_sgid (const char *child_id); + + /* Deallocate the subprocess data captured by + support_capture_subprocess. */ +diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c +index 53847194cb..2383481911 100644 +--- a/support/support_capture_subprocess.c ++++ b/support/support_capture_subprocess.c +@@ -110,7 +110,7 @@ support_capture_subprogram (const char *file, char *const argv[], + safely make it SGID with the TARGET group ID. Then runs the + executable. 
*/ + static int +-copy_and_spawn_sgid (char *child_id, gid_t gid) ++copy_and_spawn_sgid (const char *child_id, gid_t gid) + { + char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", + test_dir, (intmax_t) getpid ()); +@@ -182,7 +182,7 @@ copy_and_spawn_sgid (char *child_id, gid_t gid) + ret = 0; + infd = outfd = -1; + +- char * const args[] = {execname, child_id, NULL}; ++ char * const args[] = {execname, (char *) child_id, NULL}; + + status = support_subprogram_wait (args[0], args); + +@@ -211,7 +211,7 @@ err: + } + + int +-support_capture_subprogram_self_sgid (char *child_id) ++support_capture_subprogram_self_sgid (const char *child_id) + { + gid_t target = 0; + const int count = 64; + +commit 61dcce21e06834f7248a8d516c9ec20788fc728c +Author: Florian Weimer +Date: Mon Dec 23 13:57:55 2024 +0100 + + support: Add support_record_failure_barrier + + This can be used to stop execution after a TEST_COMPARE_BLOB + failure, for example. + + (cherry picked from commit d0b8aa6de4529231fadfe604ac2c434e559c2d9e) + +diff --git a/support/check.h b/support/check.h +index 7ea22c7a2c..8f41e5b99f 100644 +--- a/support/check.h ++++ b/support/check.h +@@ -207,6 +207,9 @@ void support_record_failure_reset (void); + failures or not. */ + int support_record_failure_is_failed (void); + ++/* Terminate the process if any failures have been encountered so far. */ ++void support_record_failure_barrier (void); ++ + __END_DECLS + + #endif /* SUPPORT_CHECK_H */ +diff --git a/support/support_record_failure.c b/support/support_record_failure.c +index 978123701d..72ee2b232f 100644 +--- a/support/support_record_failure.c ++++ b/support/support_record_failure.c +@@ -112,3 +112,13 @@ support_record_failure_is_failed (void) + synchronization for reliable test error reporting anyway. */ + return __atomic_load_n (&state->failed, __ATOMIC_RELAXED); + } ++ ++void ++support_record_failure_barrier (void) ++{ ++ if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED)) ++ { ++ puts ("error: exiting due to previous errors"); ++ exit (1); ++ } ++} + +commit 079ac4a172a8f6ba37acf1e80e57f5042d2c7561 +Author: Florian Weimer +Date: Tue May 20 19:45:06 2025 +0200 + + elf: Test case for bug 32976 (CVE-2025-4802) + + Check that LD_LIBRARY_PATH is ignored for AT_SECURE statically + linked binaries, using support_capture_subprogram_self_sgid. 
+ + Reviewed-by: Carlos O'Donell + (cherry picked from commit d8f7a79335b0d861c12c42aec94c04cd5bb181e2) + +diff --git a/elf/Makefile b/elf/Makefile +index be64c59887..afd4eb6fdd 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -266,6 +266,7 @@ tests-static-normal := \ + tst-array1-static \ + tst-array5-static \ + tst-dl-iter-static \ ++ tst-dlopen-sgid \ + tst-dst-static \ + tst-env-setuid-static \ + tst-getauxval-static \ +@@ -859,6 +860,7 @@ modules-names += \ + tst-dlmopen-twice-mod1 \ + tst-dlmopen-twice-mod2 \ + tst-dlmopen1mod \ ++ tst-dlopen-sgid-mod \ + tst-dlopen-tlsreinitmod1 \ + tst-dlopen-tlsreinitmod2 \ + tst-dlopen-tlsreinitmod3 \ +@@ -3153,3 +3155,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so + tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + $(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so + tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so ++ ++$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so +diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c +new file mode 100644 +index 0000000000..5eb79eef48 +--- /dev/null ++++ b/elf/tst-dlopen-sgid-mod.c +@@ -0,0 +1 @@ ++/* Opening this object should not succeed. */ +diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c +new file mode 100644 +index 0000000000..47829a405e +--- /dev/null ++++ b/elf/tst-dlopen-sgid.c +@@ -0,0 +1,104 @@ ++/* Test case for ignored LD_LIBRARY_PATH in static startug (bug 32976). ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* This is the name of our test object. Use a custom module for ++ testing, so that this object does not get picked up from the system ++ path. */ ++static const char dso_name[] = "tst-dlopen-sgid-mod.so"; ++ ++/* Used to mark the recursive invocation. */ ++static const char magic_argument[] = "run-actual-test"; ++ ++static int ++do_test (void) ++{ ++/* Pathname of the directory that receives the shared objects this ++ test attempts to load. */ ++ char *libdir = support_create_temp_directory ("tst-dlopen-sgid-"); ++ ++ /* This is supposed to be ignored and stripped. */ ++ TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0); ++ ++ /* Copy of libc.so.6. */ ++ { ++ char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO); ++ char *to = xasprintf ("%s/%s", libdir, LIBC_SO); ++ add_temp_file (to); ++ support_copy_file (from, to); ++ free (to); ++ free (from); ++ } ++ ++ /* Copy of the test object. 
*/ ++ { ++ char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name); ++ char *to = xasprintf ("%s/%s", libdir, dso_name); ++ add_temp_file (to); ++ support_copy_file (from, to); ++ free (to); ++ free (from); ++ } ++ ++ TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0); ++ ++ free (libdir); ++ ++ return 0; ++} ++ ++static void ++alternative_main (int argc, char **argv) ++{ ++ if (argc == 2 && strcmp (argv[1], magic_argument) == 0) ++ { ++ if (getgid () == getegid ()) ++ /* This can happen if the file system is mounted nosuid. */ ++ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", ++ (intmax_t) getgid ()); ++ ++ /* Should be removed due to SGID. */ ++ TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL); ++ ++ TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL); ++ { ++ const char *message = dlerror (); ++ TEST_COMPARE_STRING (message, ++ "tst-dlopen-sgid-mod.so:" ++ " cannot open shared object file:" ++ " No such file or directory"); ++ } ++ ++ support_record_failure_barrier (); ++ exit (EXIT_SUCCESS); ++ } ++} ++ ++#define PREPARE alternative_main ++#include + +commit 56e75b810ac39b0e390be5b66397dca0cdfa4d80 +Author: Sunil K Pandey +Date: Tue May 20 10:07:27 2025 -0700 + + x86_64: Fix typo in ifunc-impl-list.c. + + Fix wcsncpy and wcpncpy typo in ifunc-impl-list.c. + + Reviewed-by: H.J. Lu + (cherry picked from commit f2aeb6ff941dccc4c777b5621e77addea6cc076c) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 0bbb71bbbf..3db45db39b 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -922,7 +922,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), + __wcsncpy_avx2) +- X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, ++ X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, + 1, + __wcsncpy_generic)) + +@@ -952,7 +952,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), + __wcpncpy_avx2) +- X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, ++ X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, + 1, + __wcpncpy_generic)) + + +commit c8e10f14328518954072df64aafd574e67cfdde5 +Author: Florian Weimer +Date: Wed May 21 08:43:32 2025 +0200 + + elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987) + + This should really move into support_capture_subprogram_self_sgid. + + Reviewed-by: Sam James + (cherry picked from commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2) + +diff --git a/NEWS b/NEWS +index 7a6985f5dd..4b290ad4bf 100644 +--- a/NEWS ++++ b/NEWS +@@ -23,6 +23,7 @@ The following bugs are resolved with this release: + [32245] glibc -Wstringop-overflow= build failure on hppa + [32470] x86: Avoid integer truncation with large cache sizes + [32810] Crash on x86-64 if XSAVEC disable via tunable ++ [32987] elf: Fix subprocess status handling for tst-dlopen-sgid + + Version 2.40 + +diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c +index 47829a405e..5688b79f2e 100644 +--- a/elf/tst-dlopen-sgid.c ++++ b/elf/tst-dlopen-sgid.c +@@ -26,6 +26,8 @@ + #include + #include + #include ++#include ++#include + #include + + /* This is the name of our test object. 
Use a custom module for +@@ -66,10 +68,16 @@ do_test (void) + free (from); + } + +- TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0); +- + free (libdir); + ++ int status = support_capture_subprogram_self_sgid (magic_argument); ++ ++ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) ++ return EXIT_UNSUPPORTED; ++ ++ if (!WIFEXITED (status)) ++ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); ++ + return 0; + } + + +commit 42a5a940c974d02540c8da26d6374c744d148cb9 +Author: Carlos O'Donell +Date: Wed Jun 11 09:19:17 2025 -0400 + + ppc64le: Revert "powerpc: Optimized strncmp for power10" (CVE-2025-5745) + + This reverts commit 23f0d81608d0ca6379894ef81670cf30af7fd081 + + Reason for revert: Power10 strncmp clobbers non-volatile vector + registers (Bug 33060) + + Tested on ppc64le with no regressions. + + (cherry picked from commit 63c60101ce7c5eac42be90f698ba02099b41b965) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S +deleted file mode 100644 +index d4ba76acae..0000000000 +--- a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S ++++ /dev/null +@@ -1,271 +0,0 @@ +-/* Optimized strncmp implementation for PowerPC64/POWER10. +- Copyright (C) 2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +-/* Implements the function +- +- int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n) +- +- The implementation uses unaligned doubleword access to avoid specialized +- code paths depending of data alignment for first 32 bytes and uses +- vectorised loops after that. */ +- +-#ifndef STRNCMP +-# define STRNCMP strncmp +-#endif +- +-/* TODO: Change this to actual instructions when minimum binutils is upgraded +- to 2.27. Macros are defined below for these newer instructions in order +- to maintain compatibility. */ +- +-#define LXVP(xtp,dq,ra) \ +- .long(((6)<<(32-6)) \ +- | ((((xtp)-32)>>1)<<(32-10)) \ +- | ((1)<<(32-11)) \ +- | ((ra)<<(32-16)) \ +- | dq) +- +-#define COMPARE_16(vreg1,vreg2,offset) \ +- lxv vreg1+32,offset(r3); \ +- lxv vreg2+32,offset(r4); \ +- vcmpnezb. v7,vreg1,vreg2; \ +- bne cr6,L(different); \ +- cmpldi cr7,r5,16; \ +- ble cr7,L(ret0); \ +- addi r5,r5,-16; +- +-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ +- LXVP(vreg1+32,offset,r3); \ +- LXVP(vreg2+32,offset,r4); \ +- vcmpnezb. v7,vreg1+1,vreg2+1; \ +- bne cr6,L(label1); \ +- vcmpnezb. 
v7,vreg1,vreg2; \ +- bne cr6,L(label2); \ +- cmpldi cr7,r5,32; \ +- ble cr7,L(ret0); \ +- addi r5,r5,-32; +- +-#define TAIL_FIRST_16B(vreg1,vreg2) \ +- vctzlsbb r6,v7; \ +- cmpld cr7,r5,r6; \ +- ble cr7,L(ret0); \ +- vextubrx r5,r6,vreg1; \ +- vextubrx r4,r6,vreg2; \ +- subf r3,r4,r5; \ +- blr; +- +-#define TAIL_SECOND_16B(vreg1,vreg2) \ +- vctzlsbb r6,v7; \ +- addi r0,r6,16; \ +- cmpld cr7,r5,r0; \ +- ble cr7,L(ret0); \ +- vextubrx r5,r6,vreg1; \ +- vextubrx r4,r6,vreg2; \ +- subf r3,r4,r5; \ +- blr; +- +-#define CHECK_N_BYTES(reg1,reg2,len_reg) \ +- sldi r6,len_reg,56; \ +- lxvl 32+v4,reg1,r6; \ +- lxvl 32+v5,reg2,r6; \ +- add reg1,reg1,len_reg; \ +- add reg2,reg2,len_reg; \ +- vcmpnezb v7,v4,v5; \ +- vctzlsbb r6,v7; \ +- cmpld cr7,r6,len_reg; \ +- blt cr7,L(different); \ +- cmpld cr7,r5,len_reg; \ +- ble cr7,L(ret0); \ +- sub r5,r5,len_reg; \ +- +- /* TODO: change this to .machine power10 when the minimum required +- binutils allows it. */ +- .machine power9 +-ENTRY_TOCLESS (STRNCMP, 4) +- /* Check if size is 0. */ +- cmpdi cr0,r5,0 +- beq cr0,L(ret0) +- andi. r7,r3,4095 +- andi. r8,r4,4095 +- cmpldi cr0,r7,4096-16 +- cmpldi cr1,r8,4096-16 +- bgt cr0,L(crosses) +- bgt cr1,L(crosses) +- COMPARE_16(v4,v5,0) +- addi r3,r3,16 +- addi r4,r4,16 +- +-L(crosses): +- andi. r7,r3,15 +- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ +- andi. r9,r4,15 +- subfic r8,r9,16 /* r8(nalign2) = 16 - (str2 & 15). */ +- cmpld cr7,r7,r8 +- beq cr7,L(same_aligned) +- blt cr7,L(nalign1_min) +- +- /* nalign2 is minimum and s2 pointer is aligned. */ +- CHECK_N_BYTES(r3,r4,r8) +- /* Are we on the 64B hunk which crosses a page? */ +- andi. r10,r3,63 /* Determine offset into 64B hunk. */ +- andi. r8,r3,15 /* The offset into the 16B hunk. */ +- neg r7,r3 +- andi. r9,r7,15 /* Number of bytes after a 16B cross. */ +- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ +- beq L(compare_64_pagecross) +- mtctr r7 +- b L(compare_64B_unaligned) +- +- /* nalign1 is minimum and s1 pointer is aligned. */ +-L(nalign1_min): +- CHECK_N_BYTES(r3,r4,r7) +- /* Are we on the 64B hunk which crosses a page? */ +- andi. r10,r4,63 /* Determine offset into 64B hunk. */ +- andi. r8,r4,15 /* The offset into the 16B hunk. */ +- neg r7,r4 +- andi. r9,r7,15 /* Number of bytes after a 16B cross. */ +- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ +- beq L(compare_64_pagecross) +- mtctr r7 +- +- .p2align 5 +-L(compare_64B_unaligned): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- bdnz L(compare_64B_unaligned) +- +- /* Cross the page boundary of s2, carefully. Only for first +- iteration we have to get the count of 64B blocks to be checked. +- From second iteration and beyond, loop counter is always 63. 
*/ +-L(compare_64_pagecross): +- li r11, 63 +- mtctr r11 +- cmpldi r10,16 +- ble L(cross_4) +- cmpldi r10,32 +- ble L(cross_3) +- cmpldi r10,48 +- ble L(cross_2) +-L(cross_1): +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- addi r3,r3,48 +- addi r4,r4,48 +- b L(compare_64B_unaligned) +-L(cross_2): +- COMPARE_16(v4,v5,0) +- addi r3,r3,16 +- addi r4,r4,16 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r3,r3,32 +- addi r4,r4,32 +- b L(compare_64B_unaligned) +-L(cross_3): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r3,r3,32 +- addi r4,r4,32 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- addi r3,r3,16 +- addi r4,r4,16 +- b L(compare_64B_unaligned) +-L(cross_4): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- addi r3,r3,48 +- addi r4,r4,48 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- b L(compare_64B_unaligned) +- +-L(same_aligned): +- CHECK_N_BYTES(r3,r4,r7) +- /* Align s1 to 32B and adjust s2 address. +- Use lxvp only if both s1 and s2 are 32B aligned. */ +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r5,r5,32 +- +- clrldi r6,r3,59 +- subfic r7,r6,32 +- add r3,r3,r7 +- add r4,r4,r7 +- subf r5,r7,r5 +- andi. r7,r4,0x1F +- beq cr0,L(32B_aligned_loop) +- +- .p2align 5 +-L(16B_aligned_loop): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- b L(16B_aligned_loop) +- +- /* Calculate and return the difference. */ +-L(different): +- TAIL_FIRST_16B(v4,v5) +- +- .p2align 5 +-L(32B_aligned_loop): +- COMPARE_32(v14,v16,0,tail1,tail2) +- COMPARE_32(v18,v20,32,tail3,tail4) +- COMPARE_32(v22,v24,64,tail5,tail6) +- COMPARE_32(v26,v28,96,tail7,tail8) +- addi r3,r3,128 +- addi r4,r4,128 +- b L(32B_aligned_loop) +- +-L(tail1): TAIL_FIRST_16B(v15,v17) +-L(tail2): TAIL_SECOND_16B(v14,v16) +-L(tail3): TAIL_FIRST_16B(v19,v21) +-L(tail4): TAIL_SECOND_16B(v18,v20) +-L(tail5): TAIL_FIRST_16B(v23,v25) +-L(tail6): TAIL_SECOND_16B(v22,v24) +-L(tail7): TAIL_FIRST_16B(v27,v29) +-L(tail8): TAIL_SECOND_16B(v26,v28) +- +- .p2align 5 +-L(ret0): +- li r3,0 +- blr +- +-END(STRNCMP) +-libc_hidden_builtin_def(strncmp) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index b847c19049..a38ff46448 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -34,7 +34,7 @@ ifneq (,$(filter %le,$(config-machine))) + sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ + memmove-power10 memset-power10 rawmemchr-power9 \ + rawmemchr-power10 strcmp-power9 strcmp-power10 \ +- strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \ ++ strncmp-power9 strcpy-power9 stpcpy-power9 \ + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 + endif + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 2bb47d3527..30fd89e109 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -164,9 +164,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* 
Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */ + IFUNC_IMPL (i, name, strncmp, + #ifdef __LITTLE_ENDIAN__ +- IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX, +- __strncmp_power10) + IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, + __strncmp_power9) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S +deleted file mode 100644 +index d7026c12e2..0000000000 +--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S ++++ /dev/null +@@ -1,25 +0,0 @@ +-/* Copyright (C) 2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +-#define STRNCMP __strncmp_power10 +- +-#undef libc_hidden_builtin_def +-#define libc_hidden_builtin_def(name) +- +-#include +-#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c +index a5ed67f766..6178f4a432 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c +@@ -29,7 +29,6 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden; + extern __typeof (strncmp) __strncmp_power8 attribute_hidden; + # ifdef __LITTLE_ENDIAN__ + extern __typeof (strncmp) __strncmp_power9 attribute_hidden; +-extern __typeof (strncmp) __strncmp_power10 attribute_hidden; + # endif + # undef strncmp + +@@ -37,9 +36,6 @@ extern __typeof (strncmp) __strncmp_power10 attribute_hidden; + ifunc symbol properly. */ + libc_ifunc_redirected (__redirect_strncmp, strncmp, + # ifdef __LITTLE_ENDIAN__ +- (hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX) +- ? __strncmp_power10 : + (hwcap2 & PPC_FEATURE2_ARCH_3_00 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __strncmp_power9 : + +commit 2ad6e55ea5cb23af5af7af35d5f80cd93032f96a +Author: Carlos O'Donell +Date: Wed Jun 11 09:43:50 2025 -0400 + + ppc64le: Revert "powerpc: Fix performance issues of strcmp power10" (CVE-2025-5702) + + This reverts commit 90bcc8721ef82b7378d2b080141228660e862d56 + + This change is in the chain of the final revert that fixes the CVE + i.e. 3367d8e180848030d1646f088759f02b8dfe0d6f + + Reason for revert: Power10 strcmp clobbers non-volatile vector + registers (Bug 33056) + + Tested on ppc64le with no regressions. + + (cherry picked from commit c22de63588df7a8a0edceea9bb02534064c9d201) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +index f0d6732a25..00f1e9c170 100644 +--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S ++++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +@@ -62,7 +62,7 @@ + lxvl 32+v5,reg2,r0; \ + add reg1,reg1,len_reg; \ + add reg2,reg2,len_reg; \ +- vcmpnezb v7,v4,v5; \ ++ vcmpnezb. 
v7,v4,v5; \ + vctzlsbb r6,v7; \ + cmpld cr7,r6,len_reg; \ + blt cr7,L(different); \ +@@ -72,110 +72,70 @@ + + .machine power9 + ENTRY_TOCLESS (STRCMP, 4) +- andi. r7,r3,4095 +- andi. r8,r4,4095 +- cmpldi cr0,r7,4096-16 +- cmpldi cr1,r8,4096-16 +- bgt cr0,L(crosses) +- bgt cr1,L(crosses) +- COMPARE_16(v4,v5,0) +- +-L(crosses): +- andi. r7,r3,15 +- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ +- andi. r9,r4,15 +- subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */ +- cmpld cr7,r7,r5 +- beq cr7,L(same_aligned) +- blt cr7,L(nalign1_min) ++ li r11,16 ++ /* eq bit of cr1 used as swap status flag to indicate if ++ source pointers were swapped. */ ++ crclr 4*cr1+eq ++ vspltisb v19,-1 ++ andi. r7,r3,15 ++ sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ ++ andi. r9,r4,15 ++ sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ ++ cmpld cr7,r7,r5 ++ beq cr7,L(same_aligned) ++ blt cr7,L(nalign1_min) ++ /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the ++ pointer which is closer to the next 16B boundary so that only ++ one CHECK_N_BYTES is needed before entering the loop below. */ ++ mr r8,r4 ++ mr r4,r3 ++ mr r3,r8 ++ mr r12,r7 ++ mr r7,r5 ++ mr r5,r12 ++ crset 4*cr1+eq /* Set bit on swapping source pointers. */ + +- /* nalign2 is minimum and s2 pointer is aligned. */ +- CHECK_N_BYTES(r3,r4,r5) +- /* Are we on the 64B hunk which crosses a page? */ +- andi. r10,r3,63 /* Determine offset into 64B hunk. */ +- andi. r8,r3,15 /* The offset into the 16B hunk. */ +- neg r7,r3 +- andi. r9,r7,15 /* Number of bytes after a 16B cross. */ +- rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */ +- beq L(compare_64_pagecross) +- mtctr r7 +- b L(compare_64B_unaligned) +- +- /* nalign1 is minimum and s1 pointer is aligned. */ ++ .p2align 5 + L(nalign1_min): + CHECK_N_BYTES(r3,r4,r7) +- /* Are we on the 64B hunk which crosses a page? */ +- andi. r10,r4,63 /* Determine offset into 64B hunk. */ +- andi. r8,r4,15 /* The offset into the 16B hunk. */ +- neg r7,r4 +- andi. r9,r7,15 /* Number of bytes after a 16B cross. */ +- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ +- beq L(compare_64_pagecross) +- mtctr r7 + + .p2align 5 +-L(compare_64B_unaligned): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- bdnz L(compare_64B_unaligned) ++L(s1_aligned): ++ /* r9 and r5 is number of bytes to be read after and before ++ page boundary correspondingly. */ ++ sub r5,r5,r7 ++ subfic r9,r5,16 ++ /* Now let r7 hold the count of quadwords which can be ++ checked without crossing a page boundary. quadword offset is ++ (str2>>4)&0xFF. */ ++ rlwinm r7,r4,28,0xFF ++ /* Below check is required only for first iteration. For second ++ iteration and beyond, the new loop counter is always 255. */ ++ cmpldi r7,255 ++ beq L(L3) ++ /* Get the initial loop count by 255-((str2>>4)&0xFF). */ ++ subfic r11,r7,255 + +- /* Cross the page boundary of s2, carefully. Only for first +- iteration we have to get the count of 64B blocks to be checked. +- From second iteration and beyond, loop counter is always 63. 
*/ +-L(compare_64_pagecross): +- li r11, 63 ++ .p2align 5 ++L(L1): + mtctr r11 +- cmpldi r10,16 +- ble L(cross_4) +- cmpldi r10,32 +- ble L(cross_3) +- cmpldi r10,48 +- ble L(cross_2) +-L(cross_1): +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- addi r3,r3,48 +- addi r4,r4,48 +- b L(compare_64B_unaligned) +-L(cross_2): +- COMPARE_16(v4,v5,0) +- addi r3,r3,16 +- addi r4,r4,16 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r3,r3,32 +- addi r4,r4,32 +- b L(compare_64B_unaligned) +-L(cross_3): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r3,r3,32 +- addi r4,r4,32 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) ++ ++ .p2align 5 ++L(L2): ++ COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ + addi r3,r3,16 + addi r4,r4,16 +- b L(compare_64B_unaligned) +-L(cross_4): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- addi r3,r3,48 +- addi r4,r4,48 ++ bdnz L(L2) ++ /* Cross the page boundary of s2, carefully. */ ++ ++ .p2align 5 ++L(L3): ++ CHECK_N_BYTES(r3,r4,r5) + CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- b L(compare_64B_unaligned) ++ li r11,255 /* Load the new loop counter. */ ++ b L(L1) + ++ .p2align 5 + L(same_aligned): + CHECK_N_BYTES(r3,r4,r7) + /* Align s1 to 32B and adjust s2 address. +@@ -208,7 +168,18 @@ L(16B_aligned_loop): + + /* Calculate and return the difference. */ + L(different): +- TAIL(v4,v5) ++ vctzlsbb r6,v7 ++ vextubrx r5,r6,v4 ++ vextubrx r4,r6,v5 ++ bt 4*cr1+eq,L(swapped) ++ subf r3,r4,r5 ++ blr ++ ++ /* If src pointers were swapped, then swap the ++ indices and calculate the return value. */ ++L(swapped): ++ subf r3,r5,r4 ++ blr + + .p2align 5 + L(32B_aligned_loop): + +commit 672f31b90e501b4ba10ba12ab4c6051f77589912 +Author: Carlos O'Donell +Date: Wed Jun 11 09:33:45 2025 -0400 + + ppc64le: Revert "powerpc : Add optimized memchr for POWER10" (Bug 33059) + + This reverts commit b9182c793caa05df5d697427c0538936e6396d4b + + Reason for revert: Power10 memchr clobbers v20 vector register + (Bug 33059) + + This is not a security issue, unlike CVE-2025-5745 and + CVE-2025-5702. + + Tested on ppc64le without regression. + + (cherry picked from commit a7877bb6685300f159fa095c9f50b22b112cddb8) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S +deleted file mode 100644 +index 53e5716d72..0000000000 +--- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S ++++ /dev/null +@@ -1,315 +0,0 @@ +-/* Optimized memchr implementation for POWER10 LE. +- Copyright (C) 2021-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . 
*/ +- +-#include +- +-# ifndef MEMCHR +-# define MEMCHR __memchr +-# endif +-# define M_VREG_ZERO v20 +-# define M_OFF_START_LOOP 256 +-# define MEMCHR_SUBTRACT_VECTORS \ +- vsububm v4,v4,v18; \ +- vsububm v5,v5,v18; \ +- vsububm v6,v6,v18; \ +- vsububm v7,v7,v18; +-# define M_TAIL(vreg,increment) \ +- vctzlsbb r4,vreg; \ +- cmpld r5,r4; \ +- ble L(null); \ +- addi r4,r4,increment; \ +- add r3,r6,r4; \ +- blr +- +-/* TODO: Replace macros by the actual instructions when minimum binutils becomes +- >= 2.35. This is used to keep compatibility with older versions. */ +-#define M_VEXTRACTBM(rt,vrb) \ +- .long(((4)<<(32-6)) \ +- | ((rt)<<(32-11)) \ +- | ((8)<<(32-16)) \ +- | ((vrb)<<(32-21)) \ +- | 1602) +- +-#define M_LXVP(xtp,dq,ra) \ +- .long(((6)<<(32-6)) \ +- | ((((xtp)-32)>>1)<<(32-10)) \ +- | ((1)<<(32-11)) \ +- | ((ra)<<(32-16)) \ +- | dq) +- +-#define CHECK16B(vreg,offset,addr,label) \ +- lxv vreg+32,offset(addr); \ +- vcmpequb. vreg,vreg,v18; \ +- bne cr6,L(label); \ +- cmpldi r5,16; \ +- ble L(null); \ +- addi r5,r5,-16; +- +-/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # +- of bytes already checked. */ +-#define CHECK64B(offset,addr,label) \ +- M_LXVP(v4+32,offset,addr); \ +- M_LXVP(v6+32,offset+32,addr); \ +- MEMCHR_SUBTRACT_VECTORS; \ +- vminub v14,v4,v5; \ +- vminub v15,v6,v7; \ +- vminub v16,v14,v15; \ +- vcmpequb. v0,v16,M_VREG_ZERO; \ +- beq cr6,$+12; \ +- li r7,offset; \ +- b L(label); \ +- cmpldi r5,64; \ +- ble L(null); \ +- addi r5,r5,-64 +- +-/* Implements the function +- void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */ +- +- .machine power9 +- +-ENTRY_TOCLESS (MEMCHR) +- CALL_MCOUNT 3 +- +- cmpldi r5,0 +- beq L(null) +- mr r0,r5 +- xori r6,r4,0xff +- +- mtvsrd v18+32,r4 /* matching char in v18 */ +- mtvsrd v19+32,r6 /* non matching char in v19 */ +- +- vspltb v18,v18,7 /* replicate */ +- vspltb v19,v19,7 /* replicate */ +- vspltisb M_VREG_ZERO,0 +- +- /* Next 16B-aligned address. Prepare address for L(aligned). */ +- addi r6,r3,16 +- clrrdi r6,r6,4 +- +- /* Align data and fill bytes not loaded with non matching char. */ +- lvx v0,0,r3 +- lvsr v1,0,r3 +- vperm v0,v19,v0,v1 +- +- vcmpequb. v6,v0,v18 +- bne cr6,L(found) +- sub r4,r6,r3 +- cmpld r5,r4 +- ble L(null) +- sub r5,r5,r4 +- +- /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is +- optimized for longer strings, so checking the first bytes in 16B +- chunks benefits a lot small strings. */ +- .p2align 5 +-L(aligned): +- cmpldi r5,0 +- beq L(null) +- +- CHECK16B(v0,0,r6,tail1) +- CHECK16B(v1,16,r6,tail2) +- CHECK16B(v2,32,r6,tail3) +- CHECK16B(v3,48,r6,tail4) +- CHECK16B(v4,64,r6,tail5) +- CHECK16B(v5,80,r6,tail6) +- CHECK16B(v6,96,r6,tail7) +- CHECK16B(v7,112,r6,tail8) +- CHECK16B(v8,128,r6,tail9) +- CHECK16B(v9,144,r6,tail10) +- CHECK16B(v10,160,r6,tail11) +- CHECK16B(v0,176,r6,tail12) +- CHECK16B(v1,192,r6,tail13) +- CHECK16B(v2,208,r6,tail14) +- CHECK16B(v3,224,r6,tail15) +- +- cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to +- choose how we will perform the main loop. */ +- +- /* Prepare address for the loop. */ +- addi r4,r3,M_OFF_START_LOOP +- clrrdi r4,r4,6 +- sub r6,r4,r3 +- sub r5,r0,r6 +- addi r6,r4,128 +- +- /* If c == 0, use the loop without the vsububm. 
*/ +- beq cr5,L(loop) +- +- /* This is very similar to the block after L(loop), the difference is +- that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract +- each byte loaded by the char we are looking for, this way we can keep +- using vminub to merge the results and checking for nulls. */ +- .p2align 5 +-L(memchr_loop): +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- b L(memchr_loop) +- /* Switch to a more aggressive approach checking 64B each time. Use 2 +- pointers 128B apart and unroll the loop once to make the pointer +- updates and usages separated enough to avoid stalls waiting for +- address calculation. */ +- .p2align 5 +-L(loop): +-#undef MEMCHR_SUBTRACT_VECTORS +-#define MEMCHR_SUBTRACT_VECTORS /* nothing */ +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- b L(loop) +- +- .p2align 5 +-L(pre_tail_64b): +- mr r6,r4 +-L(tail_64b): +- /* OK, we found a null byte. Let's look for it in the current 64-byte +- block and mark it in its corresponding VR. lxvp vx,0(ry) puts the +- low 16B bytes into vx+1, and the high into vx, so the order here is +- v5, v4, v7, v6. */ +- vcmpequb v1,v5,M_VREG_ZERO +- vcmpequb v2,v4,M_VREG_ZERO +- vcmpequb v3,v7,M_VREG_ZERO +- vcmpequb v4,v6,M_VREG_ZERO +- +- /* Take into account the other 64B blocks we had already checked. */ +- add r6,r6,r7 +- /* Extract first bit of each byte. */ +- M_VEXTRACTBM(r8,v1) +- M_VEXTRACTBM(r9,v2) +- M_VEXTRACTBM(r10,v3) +- M_VEXTRACTBM(r11,v4) +- +- /* Shift each value into their corresponding position. */ +- sldi r9,r9,16 +- sldi r10,r10,32 +- sldi r11,r11,48 +- +- /* Merge the results. */ +- or r8,r8,r9 +- or r9,r10,r11 +- or r11,r9,r8 +- +- cnttzd r0,r11 /* Count trailing zeros before the match. */ +- cmpld r5,r0 +- ble L(null) +- add r3,r6,r0 /* Compute final address. 
*/ +- blr +- +- .p2align 5 +-L(tail1): +- M_TAIL(v0,0) +- +- .p2align 5 +-L(tail2): +- M_TAIL(v1,16) +- +- .p2align 5 +-L(tail3): +- M_TAIL(v2,32) +- +- .p2align 5 +-L(tail4): +- M_TAIL(v3,48) +- +- .p2align 5 +-L(tail5): +- M_TAIL(v4,64) +- +- .p2align 5 +-L(tail6): +- M_TAIL(v5,80) +- +- .p2align 5 +-L(tail7): +- M_TAIL(v6,96) +- +- .p2align 5 +-L(tail8): +- M_TAIL(v7,112) +- +- .p2align 5 +-L(tail9): +- M_TAIL(v8,128) +- +- .p2align 5 +-L(tail10): +- M_TAIL(v9,144) +- +- .p2align 5 +-L(tail11): +- M_TAIL(v10,160) +- +- .p2align 5 +-L(tail12): +- M_TAIL(v0,176) +- +- .p2align 5 +-L(tail13): +- M_TAIL(v1,192) +- +- .p2align 5 +-L(tail14): +- M_TAIL(v2,208) +- +- .p2align 5 +-L(tail15): +- M_TAIL(v3,224) +- +- .p2align 5 +-L(found): +- vctzlsbb r7,v6 +- cmpld r5,r7 +- ble L(null) +- add r3,r3,r7 +- blr +- +- .p2align 5 +-L(null): +- li r3,0 +- blr +- +-END (MEMCHR) +- +-weak_alias (__memchr, memchr) +-libc_hidden_builtin_def (memchr) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index a38ff46448..fa1107dfd9 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + strncase-power8 + + ifneq (,$(filter %le,$(config-machine))) +-sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ +- memmove-power10 memset-power10 rawmemchr-power9 \ +- rawmemchr-power10 strcmp-power9 strcmp-power10 \ +- strncmp-power9 strcpy-power9 stpcpy-power9 \ ++sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ ++ rawmemchr-power9 rawmemchr-power10 \ ++ strcmp-power9 strcmp-power10 strncmp-power9 \ ++ strcpy-power9 stpcpy-power9 \ + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 + endif + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 30fd89e109..9b3e617306 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */ + IFUNC_IMPL (i, name, memchr, +-#ifdef __LITTLE_ENDIAN__ +- IFUNC_IMPL_ADD (array, i, memchr, +- hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX, +- __memchr_power10) +-#endif + IFUNC_IMPL_ADD (array, i, memchr, + hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S +deleted file mode 100644 +index 7d35ef28a9..0000000000 +--- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S ++++ /dev/null +@@ -1,28 +0,0 @@ +-/* Optimized memchr implementation for POWER10/PPC64. +- Copyright (C) 2016-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +-#define MEMCHR __memchr_power10 +- +-#undef libc_hidden_builtin_def +-#define libc_hidden_builtin_def(name) +-#undef weak_alias +-#define weak_alias(name,alias) +- +-#include +-#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c +index 57d23e7b18..b4655dfcaa 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c +@@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; + extern __typeof (__memchr) __memchr_power7 attribute_hidden; + extern __typeof (__memchr) __memchr_power8 attribute_hidden; + +-# ifdef __LITTLE_ENDIAN__ +-extern __typeof (__memchr) __memchr_power10 attribute_hidden; +-# endif + /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ + libc_ifunc (__memchr, +-# ifdef __LITTLE_ENDIAN__ +- (hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX) +- ? __memchr_power10 : +-# endif +- (hwcap2 & PPC_FEATURE2_ARCH_2_07 +- && hwcap & PPC_FEATURE_HAS_ALTIVEC) +- ? __memchr_power8 : +- (hwcap & PPC_FEATURE_ARCH_2_06) +- ? __memchr_power7 +- : __memchr_ppc); ++ (hwcap2 & PPC_FEATURE2_ARCH_2_07 ++ && hwcap & PPC_FEATURE_HAS_ALTIVEC) ++ ? __memchr_power8 : ++ (hwcap & PPC_FEATURE_ARCH_2_06) ++ ? __memchr_power7 ++ : __memchr_ppc); + + weak_alias (__memchr, memchr) + libc_hidden_builtin_def (memchr) + +commit 7e12550b8e3a11764a4a9090ce6bd3fc23fc8a8e +Author: Carlos O'Donell +Date: Mon Jun 16 13:09:57 2025 -0400 + + ppc64le: Revert "powerpc: Optimized strcmp for power10" (CVE-2025-5702) + + This reverts commit 3367d8e180848030d1646f088759f02b8dfe0d6f + + Reason for revert: Power10 strcmp clobbers non-volatile vector + registers (Bug 33056) + + Tested on ppc64le without regression. + + (cherry picked from commit 15808c77b35319e67ee0dc8f984a9a1a434701bc) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +deleted file mode 100644 +index 00f1e9c170..0000000000 +--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S ++++ /dev/null +@@ -1,204 +0,0 @@ +-/* Optimized strcmp implementation for PowerPC64/POWER10. +- Copyright (C) 2021-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +-#include +- +-#ifndef STRCMP +-# define STRCMP strcmp +-#endif +- +-/* Implements the function +- int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */ +- +-/* TODO: Change this to actual instructions when minimum binutils is upgraded +- to 2.27. Macros are defined below for these newer instructions in order +- to maintain compatibility. 
*/ +- +-#define LXVP(xtp,dq,ra) \ +- .long(((6)<<(32-6)) \ +- | ((((xtp)-32)>>1)<<(32-10)) \ +- | ((1)<<(32-11)) \ +- | ((ra)<<(32-16)) \ +- | dq) +- +-#define COMPARE_16(vreg1,vreg2,offset) \ +- lxv vreg1+32,offset(r3); \ +- lxv vreg2+32,offset(r4); \ +- vcmpnezb. v7,vreg1,vreg2; \ +- bne cr6,L(different); \ +- +-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ +- LXVP(vreg1+32,offset,r3); \ +- LXVP(vreg2+32,offset,r4); \ +- vcmpnezb. v7,vreg1+1,vreg2+1; \ +- bne cr6,L(label1); \ +- vcmpnezb. v7,vreg1,vreg2; \ +- bne cr6,L(label2); \ +- +-#define TAIL(vreg1,vreg2) \ +- vctzlsbb r6,v7; \ +- vextubrx r5,r6,vreg1; \ +- vextubrx r4,r6,vreg2; \ +- subf r3,r4,r5; \ +- blr; \ +- +-#define CHECK_N_BYTES(reg1,reg2,len_reg) \ +- sldi r0,len_reg,56; \ +- lxvl 32+v4,reg1,r0; \ +- lxvl 32+v5,reg2,r0; \ +- add reg1,reg1,len_reg; \ +- add reg2,reg2,len_reg; \ +- vcmpnezb. v7,v4,v5; \ +- vctzlsbb r6,v7; \ +- cmpld cr7,r6,len_reg; \ +- blt cr7,L(different); \ +- +- /* TODO: change this to .machine power10 when the minimum required +- binutils allows it. */ +- +- .machine power9 +-ENTRY_TOCLESS (STRCMP, 4) +- li r11,16 +- /* eq bit of cr1 used as swap status flag to indicate if +- source pointers were swapped. */ +- crclr 4*cr1+eq +- vspltisb v19,-1 +- andi. r7,r3,15 +- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ +- andi. r9,r4,15 +- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ +- cmpld cr7,r7,r5 +- beq cr7,L(same_aligned) +- blt cr7,L(nalign1_min) +- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the +- pointer which is closer to the next 16B boundary so that only +- one CHECK_N_BYTES is needed before entering the loop below. */ +- mr r8,r4 +- mr r4,r3 +- mr r3,r8 +- mr r12,r7 +- mr r7,r5 +- mr r5,r12 +- crset 4*cr1+eq /* Set bit on swapping source pointers. */ +- +- .p2align 5 +-L(nalign1_min): +- CHECK_N_BYTES(r3,r4,r7) +- +- .p2align 5 +-L(s1_aligned): +- /* r9 and r5 is number of bytes to be read after and before +- page boundary correspondingly. */ +- sub r5,r5,r7 +- subfic r9,r5,16 +- /* Now let r7 hold the count of quadwords which can be +- checked without crossing a page boundary. quadword offset is +- (str2>>4)&0xFF. */ +- rlwinm r7,r4,28,0xFF +- /* Below check is required only for first iteration. For second +- iteration and beyond, the new loop counter is always 255. */ +- cmpldi r7,255 +- beq L(L3) +- /* Get the initial loop count by 255-((str2>>4)&0xFF). */ +- subfic r11,r7,255 +- +- .p2align 5 +-L(L1): +- mtctr r11 +- +- .p2align 5 +-L(L2): +- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ +- addi r3,r3,16 +- addi r4,r4,16 +- bdnz L(L2) +- /* Cross the page boundary of s2, carefully. */ +- +- .p2align 5 +-L(L3): +- CHECK_N_BYTES(r3,r4,r5) +- CHECK_N_BYTES(r3,r4,r9) +- li r11,255 /* Load the new loop counter. */ +- b L(L1) +- +- .p2align 5 +-L(same_aligned): +- CHECK_N_BYTES(r3,r4,r7) +- /* Align s1 to 32B and adjust s2 address. +- Use lxvp only if both s1 and s2 are 32B aligned. */ +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- +- clrldi r6,r3,59 +- subfic r5,r6,32 +- add r3,r3,r5 +- add r4,r4,r5 +- andi. r5,r4,0x1F +- beq cr0,L(32B_aligned_loop) +- +- .p2align 5 +-L(16B_aligned_loop): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- b L(16B_aligned_loop) +- +- /* Calculate and return the difference. 
*/ +-L(different): +- vctzlsbb r6,v7 +- vextubrx r5,r6,v4 +- vextubrx r4,r6,v5 +- bt 4*cr1+eq,L(swapped) +- subf r3,r4,r5 +- blr +- +- /* If src pointers were swapped, then swap the +- indices and calculate the return value. */ +-L(swapped): +- subf r3,r5,r4 +- blr +- +- .p2align 5 +-L(32B_aligned_loop): +- COMPARE_32(v14,v16,0,tail1,tail2) +- COMPARE_32(v18,v20,32,tail3,tail4) +- COMPARE_32(v22,v24,64,tail5,tail6) +- COMPARE_32(v26,v28,96,tail7,tail8) +- addi r3,r3,128 +- addi r4,r4,128 +- b L(32B_aligned_loop) +- +-L(tail1): TAIL(v15,v17) +-L(tail2): TAIL(v14,v16) +-L(tail3): TAIL(v19,v21) +-L(tail4): TAIL(v18,v20) +-L(tail5): TAIL(v23,v25) +-L(tail6): TAIL(v22,v24) +-L(tail7): TAIL(v27,v29) +-L(tail8): TAIL(v26,v28) +- +-END (STRCMP) +-libc_hidden_builtin_def (strcmp) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index fa1107dfd9..9f15f3207f 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -33,8 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + ifneq (,$(filter %le,$(config-machine))) + sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ + rawmemchr-power9 rawmemchr-power10 \ +- strcmp-power9 strcmp-power10 strncmp-power9 \ +- strcpy-power9 stpcpy-power9 \ ++ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 + endif + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 9b3e617306..78443b7f34 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -377,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ + IFUNC_IMPL (i, name, strcmp, + #ifdef __LITTLE_ENDIAN__ +- IFUNC_IMPL_ADD (array, i, strcmp, +- (hwcap2 & PPC_FEATURE2_ARCH_3_1) +- && (hwcap & PPC_FEATURE_HAS_VSX), +- __strcmp_power10) + IFUNC_IMPL_ADD (array, i, strcmp, + hwcap2 & PPC_FEATURE2_ARCH_3_00 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S +deleted file mode 100644 +index 1a9f6069f5..0000000000 +--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S ++++ /dev/null +@@ -1,26 +0,0 @@ +-/* Optimized strcmp implementation for POWER10/PPC64. +- Copyright (C) 2021-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . 
*/ +- +-#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +-#define STRCMP __strcmp_power10 +- +-#undef libc_hidden_builtin_def +-#define libc_hidden_builtin_def(name) +- +-#include +-#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +index ff32496fab..06b9b4090f 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +@@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; + extern __typeof (strcmp) __strcmp_power8 attribute_hidden; + # ifdef __LITTLE_ENDIAN__ + extern __typeof (strcmp) __strcmp_power9 attribute_hidden; +-extern __typeof (strcmp) __strcmp_power10 attribute_hidden; + # endif + + # undef strcmp + + libc_ifunc_redirected (__redirect_strcmp, strcmp, + # ifdef __LITTLE_ENDIAN__ +- (hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX) +- ? __strcmp_power10 : + (hwcap2 & PPC_FEATURE2_ARCH_3_00 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __strcmp_power9 : + +commit 23a02e382c8ffebfed00a082d8898f1aa468b5da +Author: Florian Weimer +Date: Wed May 21 16:47:34 2025 +0200 + + support: Pick group in support_capture_subprogram_self_sgid if UID == 0 + + When running as root, it is likely that we can run under any group. + Pick a harmless group from /etc/group in this case. + + Reviewed-by: Carlos O'Donell + (cherry picked from commit 2f769cec448d84a62b7dd0d4ff56978fe22c0cd6) + +diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c +index 2383481911..1cb344eb04 100644 +--- a/support/support_capture_subprocess.c ++++ b/support/support_capture_subprocess.c +@@ -21,7 +21,11 @@ + + #include + #include ++#include ++#include ++#include + #include ++#include + #include + #include + #include +@@ -210,10 +214,48 @@ err: + return status; + } + ++/* Returns true if a group with NAME has been found, and writes its ++ GID to *TARGET. */ ++static bool ++find_sgid_group (gid_t *target, const char *name) ++{ ++ /* Do not use getgrname_r because it does not work in statically ++ linked binaries if the system libc is different. */ ++ FILE *fp = fopen ("/etc/group", "rce"); ++ if (fp == NULL) ++ return false; ++ __fsetlocking (fp, FSETLOCKING_BYCALLER); ++ ++ bool ok = false; ++ struct scratch_buffer buf; ++ scratch_buffer_init (&buf); ++ while (true) ++ { ++ struct group grp; ++ struct group *result = NULL; ++ int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result); ++ if (status == 0 && result != NULL) ++ { ++ if (strcmp (result->gr_name, name) == 0) ++ { ++ *target = result->gr_gid; ++ ok = true; ++ break; ++ } ++ } ++ else if (errno != ERANGE) ++ break; ++ else if (!scratch_buffer_grow (&buf)) ++ break; ++ } ++ scratch_buffer_free (&buf); ++ fclose (fp); ++ return ok; ++} ++ + int + support_capture_subprogram_self_sgid (const char *child_id) + { +- gid_t target = 0; + const int count = 64; + gid_t groups[count]; + +@@ -225,6 +267,7 @@ support_capture_subprogram_self_sgid (const char *child_id) + (intmax_t) getuid ()); + + gid_t current = getgid (); ++ gid_t target = current; + for (int i = 0; i < ret; ++i) + { + if (groups[i] != current) +@@ -234,9 +277,16 @@ support_capture_subprogram_self_sgid (const char *child_id) + } + } + +- if (target == 0) +- FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", +- (intmax_t) getuid ()); ++ if (target == current) ++ { ++ /* If running as root, try to find a harmless group for SGID. 
*/ ++ if (getuid () != 0 ++ || (!find_sgid_group (&target, "nogroup") ++ && !find_sgid_group (&target, "bin") ++ && !find_sgid_group (&target, "daemon"))) ++ FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", ++ (intmax_t) getuid ()); ++ } + + return copy_and_spawn_sgid (child_id, target); + } + +commit dbc83657e290bdad3245259be80fb84cbe10304c +Author: Florian Weimer +Date: Thu May 22 14:36:37 2025 +0200 + + Fix error reporting (false negatives) in SGID tests + + And simplify the interface of support_capture_subprogram_self_sgid. + + Use the existing framework for temporary directories (now with + mode 0700) and directory/file deletion. Handle all execution + errors within support_capture_subprogram_self_sgid. In particular, + this includes test failures because the invoked program did not + exit with exit status zero. Existing tests that expect exit + status 42 are adjusted to use zero instead. + + In addition, fix callers not to call exit (0) with test failures + pending (which may mask them, especially when running with --direct). + + Fixes commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2 + ("elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)"). + + Reviewed-by: Carlos O'Donell + (cherry picked from commit 3a3fb2ed83f79100c116c824454095ecfb335ad7) + +diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c +index 5688b79f2e..8aec52e19f 100644 +--- a/elf/tst-dlopen-sgid.c ++++ b/elf/tst-dlopen-sgid.c +@@ -70,13 +70,7 @@ do_test (void) + + free (libdir); + +- int status = support_capture_subprogram_self_sgid (magic_argument); +- +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- return EXIT_UNSUPPORTED; +- +- if (!WIFEXITED (status)) +- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); ++ support_capture_subprogram_self_sgid (magic_argument); + + return 0; + } +diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c +index a47219047f..233eec7631 100644 +--- a/elf/tst-env-setuid-tunables.c ++++ b/elf/tst-env-setuid-tunables.c +@@ -105,10 +105,7 @@ do_test (int argc, char **argv) + + if (ret != 0) + exit (1); +- +- /* Special return code to make sure that the child executed all the way +- through. */ +- exit (42); ++ return 0; + } + else + { +@@ -127,18 +124,7 @@ do_test (int argc, char **argv) + continue; + } + +- int status = support_capture_subprogram_self_sgid (buf); +- +- /* Bail out early if unsupported. */ +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- return EXIT_UNSUPPORTED; +- +- if (WEXITSTATUS (status) != 42) +- { +- printf (" [%d] child failed with status %d\n", i, +- WEXITSTATUS (status)); +- support_record_failure (); +- } ++ support_capture_subprogram_self_sgid (buf); + } + return 0; + } +diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c +index 59f2ffeb88..ee3f058468 100644 +--- a/elf/tst-env-setuid.c ++++ b/elf/tst-env-setuid.c +@@ -147,10 +147,7 @@ do_test (int argc, char **argv) + + if (ret != 0) + exit (1); +- +- /* Special return code to make sure that the child executed all the way +- through. 
*/ +- exit (42); ++ return 0; + } + else + { +@@ -174,17 +171,7 @@ do_test (int argc, char **argv) + free (profilepath); + } + +- int status = support_capture_subprogram_self_sgid (SETGID_CHILD); +- +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- exit (EXIT_UNSUPPORTED); +- +- if (WEXITSTATUS (status) != 42) +- { +- printf (" child failed with status %d\n", +- WEXITSTATUS (status)); +- support_record_failure (); +- } ++ support_capture_subprogram_self_sgid (SETGID_CHILD); + + return 0; + } +diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c +index cc26ed6d15..cefee58d46 100644 +--- a/stdlib/tst-secure-getenv.c ++++ b/stdlib/tst-secure-getenv.c +@@ -57,13 +57,7 @@ do_test (void) + exit (1); + } + +- int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); +- +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- return EXIT_UNSUPPORTED; +- +- if (!WIFEXITED (status)) +- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); ++ support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); + + return 0; + } +@@ -82,6 +76,7 @@ alternative_main (int argc, char **argv) + if (secure_getenv ("PATH") != NULL) + FAIL_EXIT (4, "PATH variable not filtered out\n"); + ++ support_record_failure_barrier (); + exit (EXIT_SUCCESS); + } + } +diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h +index 5406d9f6c0..57bb941e7d 100644 +--- a/support/capture_subprocess.h ++++ b/support/capture_subprocess.h +@@ -42,10 +42,12 @@ struct support_capture_subprocess support_capture_subprocess + struct support_capture_subprocess support_capture_subprogram + (const char *file, char *const argv[], char *const envp[]); + +-/* Copy the running program into a setgid binary and run it with CHILD_ID +- argument. If execution is successful, return the exit status of the child +- program, otherwise return a non-zero failure exit code. */ +-int support_capture_subprogram_self_sgid (const char *child_id); ++/* Copy the running program into a setgid binary and run it with ++ CHILD_ID argument. If the program exits with a non-zero status, ++ exit with that exit status (or status 1 if the program did not exit ++ normally). If the test cannot be performed, exit with ++ EXIT_UNSUPPORTED. */ ++void support_capture_subprogram_self_sgid (const char *child_id); + + /* Deallocate the subprocess data captured by + support_capture_subprocess. */ +diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c +index 1cb344eb04..cbc6951064 100644 +--- a/support/support_capture_subprocess.c ++++ b/support/support_capture_subprocess.c +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + #include + + static void +@@ -113,105 +114,44 @@ support_capture_subprogram (const char *file, char *const argv[], + /* Copies the executable into a restricted directory, so that we can + safely make it SGID with the TARGET group ID. Then runs the + executable. 
*/ +-static int ++static void + copy_and_spawn_sgid (const char *child_id, gid_t gid) + { +- char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", +- test_dir, (intmax_t) getpid ()); ++ char *dirname = support_create_temp_directory ("tst-glibc-sgid-"); + char *execname = xasprintf ("%s/bin", dirname); +- int infd = -1; +- int outfd = -1; +- int ret = 1, status = 1; +- +- TEST_VERIFY (mkdir (dirname, 0700) == 0); +- if (support_record_failure_is_failed ()) +- goto err; ++ add_temp_file (execname); + +- infd = open ("/proc/self/exe", O_RDONLY); +- if (infd < 0) ++ if (access ("/proc/self/exe", R_OK) != 0) + FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); + +- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); +- TEST_VERIFY (outfd >= 0); +- if (support_record_failure_is_failed ()) +- goto err; +- +- char buf[4096]; +- for (;;) +- { +- ssize_t rdcount = read (infd, buf, sizeof (buf)); +- TEST_VERIFY (rdcount >= 0); +- if (support_record_failure_is_failed ()) +- goto err; +- if (rdcount == 0) +- break; +- char *p = buf; +- char *end = buf + rdcount; +- while (p != end) +- { +- ssize_t wrcount = write (outfd, buf, end - p); +- if (wrcount == 0) +- errno = ENOSPC; +- TEST_VERIFY (wrcount > 0); +- if (support_record_failure_is_failed ()) +- goto err; +- p += wrcount; +- } +- } ++ support_copy_file ("/proc/self/exe", execname); + +- bool chowned = false; +- TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0) +- || errno == EPERM); +- if (support_record_failure_is_failed ()) +- goto err; +- else if (!chowned) +- { +- ret = 77; +- goto err; +- } ++ if (chown (execname, getuid (), gid) != 0) ++ FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m", ++ execname, (intmax_t) gid); + +- TEST_VERIFY (fchmod (outfd, 02750) == 0); +- if (support_record_failure_is_failed ()) +- goto err; +- TEST_VERIFY (close (outfd) == 0); +- if (support_record_failure_is_failed ()) +- goto err; +- TEST_VERIFY (close (infd) == 0); +- if (support_record_failure_is_failed ()) +- goto err; ++ if (chmod (execname, 02750) != 0) ++ FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname); + + /* We have the binary, now spawn the subprocess. Avoid using + support_subprogram because we only want the program exit status, not the + contents. 
*/ +- ret = 0; +- infd = outfd = -1; + + char * const args[] = {execname, (char *) child_id, NULL}; ++ int status = support_subprogram_wait (args[0], args); + +- status = support_subprogram_wait (args[0], args); ++ free (execname); ++ free (dirname); + +-err: +- if (outfd >= 0) +- close (outfd); +- if (infd >= 0) +- close (infd); +- if (execname != NULL) +- { +- unlink (execname); +- free (execname); +- } +- if (dirname != NULL) ++ if (WIFEXITED (status)) + { +- rmdir (dirname); +- free (dirname); ++ if (WEXITSTATUS (status) == 0) ++ return; ++ else ++ exit (WEXITSTATUS (status)); + } +- +- if (ret == 77) +- FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n"); +- if (ret != 0) +- FAIL_EXIT1 ("Failed to make sgid executable for test\n"); +- +- return status; ++ else ++ FAIL_EXIT1 ("subprogram failed with status %d", status); + } + + /* Returns true if a group with NAME has been found, and writes its +@@ -253,7 +193,7 @@ find_sgid_group (gid_t *target, const char *name) + return ok; + } + +-int ++void + support_capture_subprogram_self_sgid (const char *child_id) + { + const int count = 64; +@@ -288,7 +228,7 @@ support_capture_subprogram_self_sgid (const char *child_id) + (intmax_t) getuid ()); + } + +- return copy_and_spawn_sgid (child_id, target); ++ copy_and_spawn_sgid (child_id, target); + } + + void + +commit 2eb180377b96771b8368b0915669c8c7b267e739 +Author: Florian Weimer +Date: Mon Jul 21 21:43:49 2025 +0200 + + posix: Fix double-free after allocation failure in regcomp (bug 33185) + + If a memory allocation failure occurs during bracket expression + parsing in regcomp, a double-free error may result. + + Reported-by: Anastasia Belova + Co-authored-by: Paul Eggert + Reviewed-by: Andreas K. Huettel + (cherry picked from commit 7ea06e994093fa0bcca0d0ee2c1db271d8d7885d) + +diff --git a/NEWS b/NEWS +index 4b290ad4bf..253b07ae99 100644 +--- a/NEWS ++++ b/NEWS +@@ -24,6 +24,7 @@ The following bugs are resolved with this release: + [32470] x86: Avoid integer truncation with large cache sizes + [32810] Crash on x86-64 if XSAVEC disable via tunable + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid ++ [33185] Fix double-free after allocation failure in regcomp + + Version 2.40 + +diff --git a/posix/Makefile b/posix/Makefile +index 2c598cd20a..830278a423 100644 +--- a/posix/Makefile ++++ b/posix/Makefile +@@ -303,6 +303,7 @@ tests := \ + tst-posix_spawn-setsid \ + tst-preadwrite \ + tst-preadwrite64 \ ++ tst-regcomp-bracket-free \ + tst-regcomp-truncated \ + tst-regex \ + tst-regex2 \ +diff --git a/posix/regcomp.c b/posix/regcomp.c +index 5380d3c7b9..6595bb3c0d 100644 +--- a/posix/regcomp.c ++++ b/posix/regcomp.c +@@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + { + #ifdef RE_ENABLE_I18N + free_charset (mbcset); ++ mbcset = NULL; + #endif + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; +@@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + parse_bracket_exp_free_return: + re_free (sbcset); + #ifdef RE_ENABLE_I18N +- free_charset (mbcset); ++ if (__glibc_likely (mbcset != NULL)) ++ free_charset (mbcset); + #endif /* RE_ENABLE_I18N */ + return NULL; + } +diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c +new file mode 100644 +index 0000000000..3c091d8c44 +--- /dev/null ++++ b/posix/tst-regcomp-bracket-free.c +@@ -0,0 +1,176 @@ ++/* Test regcomp bracket parsing with injected allocation failures (bug 33185). 
+diff --git a/NEWS b/NEWS
+index 4b290ad4bf..253b07ae99 100644
+--- a/NEWS
++++ b/NEWS
+@@ -24,6 +24,7 @@ The following bugs are resolved with this release:
+   [32470] x86: Avoid integer truncation with large cache sizes
+   [32810] Crash on x86-64 if XSAVEC disable via tunable
+   [32987] elf: Fix subprocess status handling for tst-dlopen-sgid
++  [33185] Fix double-free after allocation failure in regcomp
+ 
+ Version 2.40
+ 
+diff --git a/posix/Makefile b/posix/Makefile
+index 2c598cd20a..830278a423 100644
+--- a/posix/Makefile
++++ b/posix/Makefile
+@@ -303,6 +303,7 @@ tests := \
+   tst-posix_spawn-setsid \
+   tst-preadwrite \
+   tst-preadwrite64 \
++  tst-regcomp-bracket-free \
+   tst-regcomp-truncated \
+   tst-regex \
+   tst-regex2 \
+diff --git a/posix/regcomp.c b/posix/regcomp.c
+index 5380d3c7b9..6595bb3c0d 100644
+--- a/posix/regcomp.c
++++ b/posix/regcomp.c
+@@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
+     {
+ #ifdef RE_ENABLE_I18N
+       free_charset (mbcset);
++      mbcset = NULL;
+ #endif
+       /* Build a tree for simple bracket.  */
+       br_token.type = SIMPLE_BRACKET;
+@@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
+  parse_bracket_exp_free_return:
+   re_free (sbcset);
+ #ifdef RE_ENABLE_I18N
+-  free_charset (mbcset);
++  if (__glibc_likely (mbcset != NULL))
++    free_charset (mbcset);
+ #endif /* RE_ENABLE_I18N */
+   return NULL;
+ }
+diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c
+new file mode 100644
+index 0000000000..3c091d8c44
+--- /dev/null
++++ b/posix/tst-regcomp-bracket-free.c
+@@ -0,0 +1,176 @@
++/* Test regcomp bracket parsing with injected allocation failures (bug 33185).
++   Copyright (C) 2025 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++/* This test invokes regcomp multiple times, failing one memory
++   allocation in each call.  The function call should fail with
++   REG_ESPACE (or succeed if it can recover from the allocation
++   failure).  Previously, there was double-free bug.  */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/* Data structure allocated via MAP_SHARED, so that writes from the
++   subprocess are visible.  */
++struct shared_data
++{
++  /* Number of tracked allocations performed so far.  */
++  volatile unsigned int allocation_count;
++
++  /* If this number is reached, one allocation fails.  */
++  volatile unsigned int failing_allocation;
++
++  /* The subprocess stores the expected name here.  */
++  char name[100];
++};
++
++/* Allocation count in shared mapping.  */
++static struct shared_data *shared;
++
++/* Returns true if a failure should be injected for this allocation.  */
++static bool
++fail_this_allocation (void)
++{
++  if (shared != NULL)
++    {
++      unsigned int count = shared->allocation_count;
++      shared->allocation_count = count + 1;
++      return count == shared->failing_allocation;
++    }
++  else
++    return false;
++}
++
++/* Failure-injecting wrappers for allocation functions used by glibc.  */
++
++void *
++malloc (size_t size)
++{
++  if (fail_this_allocation ())
++    {
++      errno = ENOMEM;
++      return NULL;
++    }
++  extern __typeof (malloc) __libc_malloc;
++  return __libc_malloc (size);
++}
++
++void *
++calloc (size_t a, size_t b)
++{
++  if (fail_this_allocation ())
++    {
++      errno = ENOMEM;
++      return NULL;
++    }
++  extern __typeof (calloc) __libc_calloc;
++  return __libc_calloc (a, b);
++}
++
++void *
++realloc (void *ptr, size_t size)
++{
++  if (fail_this_allocation ())
++    {
++      errno = ENOMEM;
++      return NULL;
++    }
++  extern __typeof (realloc) __libc_realloc;
++  return __libc_realloc (ptr, size);
++}
++
++/* No-op subprocess to verify that support_isolate_in_subprocess does
++   not perform any heap allocations.  */
++static void
++no_op (void *ignored)
++{
++}
++
++/* Perform a regcomp call in a subprocess.  Used to count its
++   allocations.  */
++static void
++initialize (void *regexp1)
++{
++  const char *regexp = regexp1;
++
++  shared->allocation_count = 0;
++
++  regex_t reg;
++  TEST_COMPARE (regcomp (&reg, regexp, 0), 0);
++}
++
++/* Perform regcomp in a subprocess with fault injection.  */
++static void
++test_in_subprocess (void *regexp1)
++{
++  const char *regexp = regexp1;
++  unsigned int inject_at = shared->failing_allocation;
++
++  regex_t reg;
++  int ret = regcomp (&reg, regexp, 0);
++
++  if (ret != 0)
++    {
++      TEST_COMPARE (ret, REG_ESPACE);
++      printf ("info: allocation %u failure results in return value %d,"
++              " error %s (%d)\n",
++              inject_at, ret, strerrorname_np (errno), errno);
++    }
++}
++
++static int
++do_test (void)
++{
++  char regexp[] = "[:alpha:]";
++
++  shared = support_shared_allocate (sizeof (*shared));
++
++  /* Disable fault injection.  */
++  shared->failing_allocation = ~0U;
++
++  support_isolate_in_subprocess (no_op, NULL);
++  TEST_COMPARE (shared->allocation_count, 0);
++
++  support_isolate_in_subprocess (initialize, regexp);
++
++  /* The number of allocations in the successful case, plus some
++     slack.  Once the number of expected allocations is exceeded,
++     injecting further failures does not make a difference.  */
++  unsigned int maximum_allocation_count = shared->allocation_count;
++  printf ("info: successful call performs %u allocations\n",
++          maximum_allocation_count);
++  maximum_allocation_count += 10;
++
++  for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count;
++       ++inject_at)
++    {
++      shared->allocation_count = 0;
++      shared->failing_allocation = inject_at;
++      support_isolate_in_subprocess (test_in_subprocess, regexp);
++    }
++
++  support_shared_free (shared);
++
++  return 0;
++}
++
++#include
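The new test above hides allocation failures behind interposed malloc/calloc/realloc wrappers that fall through to glibc's __libc_ entry points, with the counters kept in a MAP_SHARED region so the parent can make each isolated subprocess run fail exactly one allocation. A stripped-down, single-process sketch of the same interposition idea (an illustration, not the test itself; the choice of which allocation to fail is arbitrary, and it relies on glibc exporting __libc_malloc) might be:

#include <errno.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>

static volatile unsigned int allocation_count;
static volatile unsigned int failing_allocation = ~0U;

/* Interposed malloc: count calls and fail exactly one of them,
   delegating everything else to glibc's real allocator.  */
void *
malloc (size_t size)
{
  if (allocation_count++ == failing_allocation)
    {
      errno = ENOMEM;
      return NULL;
    }
  extern __typeof (malloc) __libc_malloc;
  return __libc_malloc (size);
}

int
main (void)
{
  /* Arbitrary choice: fail the malloc call with index 3 (the fourth
     one counted from here).  */
  failing_allocation = 3;
  allocation_count = 0;

  regex_t reg;
  int ret = regcomp (&reg, "[:alpha:]", 0);
  printf ("regcomp returned %d (REG_ESPACE is %d)\n", ret, REG_ESPACE);
  if (ret == 0)
    regfree (&reg);
  return 0;
}

Because glibc routes its own internal allocations through the interposable malloc symbol, the injected failure is seen by regcomp as a genuine out-of-memory condition, which is exactly the situation the bug-33185 fix has to survive.
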
diff --git a/pkgs/development/libraries/glibc/common.nix b/pkgs/development/libraries/glibc/common.nix
index e943ad8f4b97..76685fed8771 100644
--- a/pkgs/development/libraries/glibc/common.nix
+++ b/pkgs/development/libraries/glibc/common.nix
@@ -68,7 +68,7 @@ stdenv.mkDerivation (
     /* No tarballs for stable upstream branch, only https://sourceware.org/git/glibc.git
        and using git would complicate bootstrapping.
         $ git fetch --all -p && git checkout origin/release/2.40/master && git describe
-        glibc-2.40-66-g7d4b6bcae9
+        glibc-2.40-142-g2eb180377b
         $ git show --minimal --reverse glibc-2.40.. ':!ADVISORIES' > 2.40-master.patch
        To compare the archive contents zdiff can be used.