Blob Blame History Raw
From 316ef0d22b6f41722cc71316a6b59e60e7c9576c Mon Sep 17 00:00:00 2001
From: Julian Seward <jseward.tech@gmail.com>
Date: Mon, 11 Mar 2024 17:53:14 +0100
Subject: [PATCH] Handle gcc __builtin_strcmp using 128/256 bit vectors with
 sse4.1, avx/avx2

* amd64 front end: redo the translation into IR for PTEST, so as to
  use only IROps which we know Memcheck can do exact instrumentation
  for.  Handling for both the 128- and 256-bit cases has been
  changed.

* ir_opt.c: add some constant folding rules to support the above.  In
  particular, for the case `ptest %reg, %reg` (the same reg twice), we
  want rflags.C to be set to a defined-1 even if %reg is completely
  undefined.  Doing that requires folding `x and not(x)` to zero when
  x has type V128 or V256.

* memcheck/tests/amd64/rh2257546_{128,256}.c: new test cases

https://bugzilla.redhat.com/show_bug.cgi?id=2257546
---
 VEX/priv/guest_amd64_toIR.c                   | 64 +++++++++----------
 VEX/priv/ir_opt.c                             | 34 ++++++++++
 memcheck/tests/amd64/Makefile.am              |  7 +-
 memcheck/tests/amd64/rh2257546_128.c          | 32 ++++++++++
 memcheck/tests/amd64/rh2257546_128.stderr.exp |  0
 memcheck/tests/amd64/rh2257546_128.stdout.exp |  1 +
 memcheck/tests/amd64/rh2257546_128.vgtest     |  2 +
 memcheck/tests/amd64/rh2257546_256.c          | 32 ++++++++++
 memcheck/tests/amd64/rh2257546_256.stderr.exp |  0
 memcheck/tests/amd64/rh2257546_256.stdout.exp |  1 +
 memcheck/tests/amd64/rh2257546_256.vgtest     |  3 +
 11 files changed, 141 insertions(+), 35 deletions(-)
 create mode 100644 memcheck/tests/amd64/rh2257546_128.c
 create mode 100644 memcheck/tests/amd64/rh2257546_128.stderr.exp
 create mode 100644 memcheck/tests/amd64/rh2257546_128.stdout.exp
 create mode 100644 memcheck/tests/amd64/rh2257546_128.vgtest
 create mode 100644 memcheck/tests/amd64/rh2257546_256.c
 create mode 100644 memcheck/tests/amd64/rh2257546_256.stderr.exp
 create mode 100644 memcheck/tests/amd64/rh2257546_256.stdout.exp
 create mode 100644 memcheck/tests/amd64/rh2257546_256.vgtest

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 0414aa5c5..d7c25042d 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -16826,13 +16826,18 @@ static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
 
 static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
 {
-   /* Set Z=1 iff (vecE & vecG) == 0
-      Set C=1 iff (vecE & not vecG) == 0
+   /* Set Z=1 iff (vecE & vecG) == 0--(128)--0
+      Set C=1 iff (vecE & not vecG) == 0--(128)--0
+
+      For the case `sign == 0`, be careful to use only IROps that can be
+      instrumented exactly by memcheck.  This is because PTEST is used for
+      __builtin_strcmp in gcc14.  See
+      https://bugzilla.redhat.com/show_bug.cgi?id=2257546
    */
 
    /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
 
-   /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
+   /* andV resp. andnV are reduced to 64-bit values by or-ing the top
       and bottom 64-bits together.  It relies on this trick:
 
       InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence
@@ -16862,11 +16867,13 @@ static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
                      binop(Iop_InterleaveHI64x2,
                            mkexpr(andnV), mkexpr(andnV)))));
 
+   // Make z64 and c64 be either all-0s or all-1s
    IRTemp z64 = newTemp(Ity_I64);
    IRTemp c64 = newTemp(Ity_I64);
+
    if (sign == 64) {
-      /* When only interested in the most significant bit, just shift
-         arithmetically right and negate.  */
+      /* When only interested in the most significant bit, just copy bit 63
+         into all bit positions, then invert. */
       assign(z64,
              unop(Iop_Not64,
                   binop(Iop_Sar64, mkexpr(and64), mkU8(63))));
@@ -16874,37 +16881,28 @@ static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
       assign(c64,
              unop(Iop_Not64,
                   binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
-   } else {
-      if (sign == 32) {
-         /* When interested in bit 31 and bit 63, mask those bits and
-            fallthrough into the PTEST handling.  */
-         IRTemp t0 = newTemp(Ity_I64);
-         IRTemp t1 = newTemp(Ity_I64);
-         IRTemp t2 = newTemp(Ity_I64);
-         assign(t0, mkU64(0x8000000080000000ULL));
-         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
-         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
-         and64 = t1;
-         andn64 = t2;
-      }
-      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
-         slice out the Z and C bits conveniently.  We use the standard
-         trick all-zeroes -> all-zeroes, anything-else -> all-ones
-         done by "(x | -x) >>s (word-size - 1)".
-      */
+   } else if (sign == 32) {
+      /* If we're interested in bits 63 and 31, OR bit 31 into bit 63, copy
+         bit 63 into all bit positions, then invert. */
+      IRTemp and3264 = newTemp(Ity_I64);
+      assign(and3264, binop(Iop_Or64, mkexpr(and64),
+                            binop(Iop_Shl64, mkexpr(and64), mkU8(32))));
       assign(z64,
              unop(Iop_Not64,
-                  binop(Iop_Sar64,
-                        binop(Iop_Or64,
-                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
-                                    mkexpr(and64)), mkU8(63))));
+                  binop(Iop_Sar64, mkexpr(and3264), mkU8(63))));
 
+      IRTemp andn3264 = newTemp(Ity_I64);
+      assign(andn3264, binop(Iop_Or64, mkexpr(andn64),
+                             binop(Iop_Shl64, mkexpr(andn64), mkU8(32))));
       assign(c64,
              unop(Iop_Not64,
-                  binop(Iop_Sar64,
-                        binop(Iop_Or64,
-                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
-                                    mkexpr(andn64)), mkU8(63))));
+                  binop(Iop_Sar64, mkexpr(andn3264), mkU8(63))));
+   } else {
+      vassert(sign == 0);
+      assign(z64, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(and64), mkU64(0)),
+                             mkU64(~0ULL), mkU64(0ULL)));
+      assign(c64, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(andn64), mkU64(0)),
+                             mkU64(~0ULL), mkU64(0ULL)));
    }
 
    /* And finally, slice out the Z and C flags and set the flags
@@ -16966,9 +16964,7 @@ static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx,
    IRTemp andnV = newTemp(Ity_V128);
    assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
    assign(andnV, binop(Iop_AndV128,
-                       mkexpr(vecE),
-                       binop(Iop_XorV128, mkexpr(vecG),
-                                          mkV128(0xFFFF))));
+                       mkexpr(vecE), unop(Iop_NotV128, mkexpr(vecG))));
 
    finish_xTESTy ( andV, andnV, sign );
    return delta;
diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c
index f918e9f85..6453f4fdf 100644
--- a/VEX/priv/ir_opt.c
+++ b/VEX/priv/ir_opt.c
@@ -1693,6 +1693,17 @@ static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e )
             break;
          }
 
+         /* Similarly .. */
+         case Iop_V256toV128_0: case Iop_V256toV128_1: {
+            UInt v256 = e->Iex.Unop.arg->Iex.Const.con->Ico.V256;
+            if (v256 == 0x00000000) {
+               e2 = IRExpr_Const(IRConst_V128(0x0000));
+            } else {
+               goto unhandled;
+            }
+            break;
+         }
+
          case Iop_ZeroHI64ofV128: {
             /* Could do better here -- only need to look at the bottom 64 bits
                of the argument, really. */
@@ -2129,6 +2140,8 @@ static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e )
             }
 
             /* -- V128 stuff -- */
+            case Iop_InterleaveLO64x2: // This, and the HI64, are created
+            case Iop_InterleaveHI64x2: // by the amd64 PTEST translation
             case Iop_InterleaveLO8x16: {
                /* This turns up a lot in Memcheck instrumentation of
                   Icc generated code.  I don't know why. */
@@ -2321,6 +2334,27 @@ static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e )
                      break;
                   }
                }
+               /* AndV128( x, NotV128( x ) ) ==> 0...0  and
+                  AndV256( x, NotV256( x ) ) ==> 0...0
+                  This is generated by amd64 `ptest %xmmReg, %xmmReg`
+                    (the same reg both times)
+                  See https://bugzilla.redhat.com/show_bug.cgi?id=2257546 */
+               if (e->Iex.Binop.op == Iop_AndV128
+                   || e->Iex.Binop.op == Iop_AndV256) {
+                  Bool isV256 = e->Iex.Binop.op == Iop_AndV256;
+                  IRExpr* x1 = chase(env, e->Iex.Binop.arg1);
+                  IRExpr* rhs = chase(env, e->Iex.Binop.arg2);
+                  if (x1 && rhs
+                         && rhs->tag == Iex_Unop
+                         && rhs->Iex.Unop.op == (isV256 ? Iop_NotV256
+                                                        : Iop_NotV128)) {
+                     IRExpr* x2 = chase(env, rhs->Iex.Unop.arg);
+                     if (x2 && sameIRExprs(env, x1, x2)) {
+                        e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
+                        break;
+                     }
+                  }
+               }
                break;
 
             case Iop_OrV128:
diff --git a/memcheck/tests/amd64/Makefile.am b/memcheck/tests/amd64/Makefile.am
index 0d2812dd8..906b8f393 100644
--- a/memcheck/tests/amd64/Makefile.am
+++ b/memcheck/tests/amd64/Makefile.am
@@ -20,6 +20,10 @@ EXTRA_DIST = \
 		insn-pmovmskb.stderr.exp-clang \
 	more_x87_fp.stderr.exp more_x87_fp.stdout.exp more_x87_fp.vgtest \
 	pcmpgt.stderr.exp pcmpgt.vgtest \
+	rh2257546_128.vgtest \
+		rh2257546_128.stderr.exp rh2257546_128.stdout.exp \
+	rh2257546_256.vgtest \
+		rh2257546_256.stderr.exp rh2257546_256.stdout.exp \
 	sh-mem-vec128-plo-no.vgtest \
 		sh-mem-vec128-plo-no.stderr.exp \
 		sh-mem-vec128-plo-no.stdout.exp \
@@ -46,12 +50,13 @@ check_PROGRAMS = \
 	insn-bsfl \
 	insn-pmovmskb \
 	pcmpgt \
+	rh2257546_128 \
 	sh-mem-vec128 \
 	sse_memory \
 	xor-undef-amd64
 
 if BUILD_AVX_TESTS
- check_PROGRAMS += sh-mem-vec256 xsave-avx
+ check_PROGRAMS += rh2257546_256 sh-mem-vec256 xsave-avx
 endif
 if HAVE_ASM_CONSTRAINT_P
  check_PROGRAMS += insn-pcmpistri
diff --git a/memcheck/tests/amd64/rh2257546_128.c b/memcheck/tests/amd64/rh2257546_128.c
new file mode 100644
index 000000000..a405aa775
--- /dev/null
+++ b/memcheck/tests/amd64/rh2257546_128.c
@@ -0,0 +1,32 @@
+
+// This should run on memcheck without reporting an undef-value error.
+// See https://bugzilla.redhat.com/show_bug.cgi?id=2257546
+
+#include <stdio.h>
+#include <malloc.h>
+
+int main ( void ) 
+{
+   char* c1 = malloc(16);
+   c1[0] = 'x'; c1[1] = 'y'; c1[2] = 'x'; c1[3] = 0;
+
+   char* c2 = "foobarxyzzyfoobarzyzzy";  // strlen > 16
+
+   long long int res;
+   __asm__ __volatile__(
+   "movdqu   (%1),   %%xmm4"   "\n\t"
+   "movdqu   (%2),   %%xmm5"   "\n\t"
+   "pxor     %%xmm4, %%xmm5"   "\n\t"
+   "ptest    %%xmm5, %%xmm5"   "\n\t"
+   "je       zzz1f"            "\n\t"
+   "mov      $99, %0"          "\n\t"
+   "jmp      zzzafter"         "\n"
+   "zzz1f:"                    "\n\t"
+   "mov $88, %0"               "\n"
+   "zzzafter:"                 "\n\t"
+   : /*OUT*/"=r"(res) : /*IN*/"r"(c1),"r"(c2) : /*TRASH*/"xmm4","xmm5","cc"
+   );
+   printf("res = %lld\n", res);
+   free(c1);
+   return 0;
+}
diff --git a/memcheck/tests/amd64/rh2257546_128.stderr.exp b/memcheck/tests/amd64/rh2257546_128.stderr.exp
new file mode 100644
index 000000000..e69de29bb
diff --git a/memcheck/tests/amd64/rh2257546_128.stdout.exp b/memcheck/tests/amd64/rh2257546_128.stdout.exp
new file mode 100644
index 000000000..454131089
--- /dev/null
+++ b/memcheck/tests/amd64/rh2257546_128.stdout.exp
@@ -0,0 +1 @@
+res = 99
diff --git a/memcheck/tests/amd64/rh2257546_128.vgtest b/memcheck/tests/amd64/rh2257546_128.vgtest
new file mode 100644
index 000000000..94414cd6f
--- /dev/null
+++ b/memcheck/tests/amd64/rh2257546_128.vgtest
@@ -0,0 +1,2 @@
+prog: rh2257546_128
+vgopts: -q
diff --git a/memcheck/tests/amd64/rh2257546_256.c b/memcheck/tests/amd64/rh2257546_256.c
new file mode 100644
index 000000000..235005ca6
--- /dev/null
+++ b/memcheck/tests/amd64/rh2257546_256.c
@@ -0,0 +1,32 @@
+
+// This should run on memcheck without reporting an undef-value error.
+// See https://bugzilla.redhat.com/show_bug.cgi?id=2257546
+
+#include <stdio.h>
+#include <malloc.h>
+
+int main ( void ) 
+{
+   char* c1 = malloc(32);
+   c1[0] = 'x'; c1[1] = 'y'; c1[2] = 'x'; c1[3] = 0;
+
+   char* c2 = "foobarxyzzyfoobarzyzzyandawholelotmoretoo";  // strlen > 32
+
+   long long int res;
+   __asm__ __volatile__(
+   "vmovdqu  (%1),   %%ymm4"           "\n\t"
+   "vmovdqu  (%2),   %%ymm5"           "\n\t"
+   "vpxor    %%ymm4, %%ymm5, %%ymm5"   "\n\t"
+   "vptest   %%ymm5, %%ymm5"           "\n\t"
+   "je       zzz1f"                    "\n\t"
+   "mov      $99, %0"                  "\n\t"
+   "jmp      zzzafter"                 "\n"
+   "zzz1f:"                            "\n\t"
+   "mov $88, %0"                       "\n"
+   "zzzafter:"                         "\n\t"
+   : /*OUT*/"=r"(res) : /*IN*/"r"(c1),"r"(c2) : /*TRASH*/"ymm4","ymm5","cc"
+   );
+   printf("res = %lld\n", res);
+   free(c1);
+   return 0;
+}
diff --git a/memcheck/tests/amd64/rh2257546_256.stderr.exp b/memcheck/tests/amd64/rh2257546_256.stderr.exp
new file mode 100644
index 000000000..e69de29bb
diff --git a/memcheck/tests/amd64/rh2257546_256.stdout.exp b/memcheck/tests/amd64/rh2257546_256.stdout.exp
new file mode 100644
index 000000000..454131089
--- /dev/null
+++ b/memcheck/tests/amd64/rh2257546_256.stdout.exp
@@ -0,0 +1 @@
+res = 99
diff --git a/memcheck/tests/amd64/rh2257546_256.vgtest b/memcheck/tests/amd64/rh2257546_256.vgtest
new file mode 100644
index 000000000..86eef8fe2
--- /dev/null
+++ b/memcheck/tests/amd64/rh2257546_256.vgtest
@@ -0,0 +1,3 @@
+prereq: test -e rh2257546_256
+prog: rh2257546_256
+vgopts: -q
-- 
2.43.0