21 changes: 21 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9854,6 +9854,27 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
}

if (VTList.VTs[0].isVector() &&
VTList.VTs[0].getVectorElementType() == MVT::i1 &&
VTList.VTs[1].getVectorElementType() == MVT::i1) {
Contributor: Why limit this to vectors only?

Collaborator (Author): Mainly because all other add/sub/sat cases did.

Collaborator (Author): Would you prefer if I generalized this for scalars as well?

Contributor: I think it would make sense to switch all of them to work on scalars as well, but that can be a separate change...

SDValue F1 = getFreeze(N1);
SDValue F2 = getFreeze(N2);
// {vXi1,vXi1} (u/s)addo(vXi1 x, vXi1 y) -> {xor(x,y),and(x,y)}
if (Opcode == ISD::UADDO || Opcode == ISD::SADDO)
return getNode(ISD::MERGE_VALUES, DL, VTList,
{getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
getNode(ISD::AND, DL, VTList.VTs[1], F1, F2)},
Flags);
// {vXi1,vXi1} (u/s)subo(vXi1 x, vXi1 y) -> {xor(x,y),and(~x,y)}
if (Opcode == ISD::USUBO || Opcode == ISD::SSUBO) {
SDValue NotF1 = getNOT(DL, F1, VTList.VTs[0]);
return getNode(ISD::MERGE_VALUES, DL, VTList,
{getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
getNode(ISD::AND, DL, VTList.VTs[1], NotF1, F2)},
Flags);
}
}
break;
}
case ISD::SMUL_LOHI:
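The single-bit identities the fold relies on can be sanity-checked by enumerating all i1 inputs. The standalone C++ sketch below is not part of the patch; it only confirms that for 1-bit operands the truncated sum/difference is x ^ y, the add carry-out is x & y, and the sub borrow is ~x & y. For i1 the signed cases coincide with the unsigned ones (the only signed values are 0 and -1), which is why SADDO/SSUBO take the same path as UADDO/USUBO.

// Standalone sanity check for the i1 overflow identities used in the fold.
// Not part of the patch; it enumerates every 1-bit input combination.
#include <cassert>

int main() {
  for (unsigned X = 0; X <= 1; ++X) {
    for (unsigned Y = 0; Y <= 1; ++Y) {
      // addo: the 1-bit sum is X ^ Y, and the carry-out (overflow) is X & Y.
      assert(((X + Y) & 1) == (X ^ Y));
      assert(((X + Y) >> 1) == (X & Y));
      // subo: the 1-bit difference is X ^ Y, and the borrow (overflow) is ~X & Y.
      assert(((X - Y) & 1) == (X ^ Y));
      assert((X < Y ? 1u : 0u) == (~X & Y & 1u));
    }
  }
  return 0;
}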
20 changes: 8 additions & 12 deletions llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -245,21 +245,17 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.4h, #1
; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: shl v2.4h, v2.4h, #15
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmlt v1.4h, v2.4h, #0
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: shl v2.4h, v0.4h, #15
; CHECK-NEXT: cmlt v2.4h, v2.4h, #0
; CHECK-NEXT: bic v1.4h, #2
; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: addv h1, v1.4h
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
38 changes: 38 additions & 0 deletions llvm/test/CodeGen/X86/pr69080.ll
@@ -0,0 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX

define { <4 x i1>, <4 x i1> } @uaddo(<4 x i1> %a) {
; SSE-LABEL: uaddo:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uaddo:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%f = call { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
ret { <4 x i1>, <4 x i1> } %f
}
declare { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)

define { <4 x i1>, <4 x i1> } @saddo(<4 x i1> %a) {
; SSE-LABEL: saddo:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: saddo:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%f = call { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
ret { <4 x i1>, <4 x i1> } %f
}
declare { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
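The codegen above is what the fold predicts for the x == y case: the sum lane xor(a, a) is all zeroes and the overflow lane and(a, a) is a itself, so each function reduces to one register copy plus a zeroing xor (the two <4 x i1> struct fields come back in xmm0 and xmm1 under the x86-64 calling convention). A minimal standalone check of that degenerate case, again not part of the patch:

// For X == Y the i1 fold degenerates to {0, X}: xor(X, X) == 0 and and(X, X) == X.
#include <cassert>

int main() {
  for (unsigned X = 0; X <= 1; ++X) {
    assert((X ^ X) == 0u); // sum lane collapses to all-zero
    assert((X & X) == X);  // overflow lane is the input itself
  }
  return 0;
}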
35 changes: 12 additions & 23 deletions llvm/test/CodeGen/X86/vec_saddo.ll
@@ -976,46 +976,35 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: saddo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: pslld $31, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: saddo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
; AVX-NEXT: vmovmskps %xmm2, %eax
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: saddo_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k2
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2}
; AVX512-NEXT: kxorw %k0, %k1, %k1
; AVX512-NEXT: kandw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k2, %k0
36 changes: 13 additions & 23 deletions llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -985,34 +985,24 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: pslld $31, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
; AVX-NEXT: vmovmskps %xmm2, %eax
; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
@@ -1022,11 +1012,11 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
; AVX512-NEXT: kxorw %k1, %k0, %k0
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
; AVX512-NEXT: kshiftlw $12, %k1, %k0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
63 changes: 20 additions & 43 deletions llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1075,61 +1075,38 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: uaddo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: pslld $31, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uaddo_v4i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: movb %al, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: movb %al, (%rdi)
; AVX2-NEXT: retq
; AVX-LABEL: uaddo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
; AVX-NEXT: vmovmskps %xmm2, %eax
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: kandnw %k0, %k1, %k2
; AVX512-NEXT: kxorw %k1, %k0, %k2
; AVX512-NEXT: kandw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
; AVX512-NEXT: kshiftlw $12, %k1, %k0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k2, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
63 changes: 20 additions & 43 deletions llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1122,61 +1122,38 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: pslld $31, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: usubo_v4i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: movb %al, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: movb %al, (%rdi)
; AVX2-NEXT: retq
; AVX-LABEL: usubo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
; AVX-NEXT: vmovmskps %xmm2, %eax
; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
; AVX512-NEXT: kxorw %k1, %k0, %k0
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
; AVX512-NEXT: kshiftlw $12, %k1, %k0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)