22; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem     | FileCheck %s --check-prefixes=NHM 
33; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB 
44; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell   | FileCheck %s --check-prefixes=FAST-SCALAR,BDW 
5- ; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake     | FileCheck %s --check-prefixes=FAST-SCALAR,SKL  
6- ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1      | FileCheck %s --check-prefixes=SLOW-SCALAR,ZN1  
7- ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3      | FileCheck %s --check-prefixes=SLOW-SCALAR,ZN3  
5+ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake     | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR  
6+ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1      | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR  
7+ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3      | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR  
88
99define  float  @f32_no_daz (float  %f ) #0  {
1010; NHM-LABEL: f32_no_daz: 
@@ -26,19 +26,6 @@ define float @f32_no_daz(float %f) #0 {
2626; FAST-SCALAR:       # %bb.0: 
2727; FAST-SCALAR-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 
2828; FAST-SCALAR-NEXT:    retq 
29- ; 
30- ; SLOW-SCALAR-LABEL: f32_no_daz: 
31- ; SLOW-SCALAR:       # %bb.0: 
32- ; SLOW-SCALAR-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1 
33- ; SLOW-SCALAR-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] 
34- ; SLOW-SCALAR-NEXT:    vmulss %xmm1, %xmm0, %xmm2 
35- ; SLOW-SCALAR-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem 
36- ; SLOW-SCALAR-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 
37- ; SLOW-SCALAR-NEXT:    vandps %xmm3, %xmm0, %xmm0 
38- ; SLOW-SCALAR-NEXT:    vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 
39- ; SLOW-SCALAR-NEXT:    vmulss %xmm1, %xmm2, %xmm1 
40- ; SLOW-SCALAR-NEXT:    vandnps %xmm1, %xmm0, %xmm0 
41- ; SLOW-SCALAR-NEXT:    retq 
4229  %call  = tail  call  fast float  @llvm.sqrt.f32 (float  %f ) #2 
4330  ret  float  %call 
4431}
@@ -91,42 +78,10 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
9178; BDW-NEXT:    vandps %xmm1, %xmm0, %xmm0 
9279; BDW-NEXT:    retq 
9380; 
94- ; SKL-LABEL: v4f32_no_daz: 
95- ; SKL:       # %bb.0: 
96- ; SKL-NEXT:    vsqrtps %xmm0, %xmm0 
97- ; SKL-NEXT:    retq 
98- ; 
99- ; ZN1-LABEL: v4f32_no_daz: 
100- ; ZN1:       # %bb.0: 
101- ; ZN1-NEXT:    vrsqrtps %xmm0, %xmm1 
102- ; ZN1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
103- ; ZN1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN] 
104- ; ZN1-NEXT:    vmulps %xmm1, %xmm0, %xmm2 
105- ; ZN1-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 
106- ; ZN1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
107- ; ZN1-NEXT:    vandps %xmm4, %xmm0, %xmm0 
108- ; ZN1-NEXT:    vmulps %xmm1, %xmm2, %xmm1 
109- ; ZN1-NEXT:    vmulps %xmm3, %xmm1, %xmm1 
110- ; ZN1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] 
111- ; ZN1-NEXT:    vcmpleps %xmm0, %xmm3, %xmm0 
112- ; ZN1-NEXT:    vandps %xmm1, %xmm0, %xmm0 
113- ; ZN1-NEXT:    retq 
114- ; 
115- ; ZN3-LABEL: v4f32_no_daz: 
116- ; ZN3:       # %bb.0: 
117- ; ZN3-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
118- ; ZN3-NEXT:    vrsqrtps %xmm0, %xmm1 
119- ; ZN3-NEXT:    vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN] 
120- ; ZN3-NEXT:    vmulps %xmm1, %xmm0, %xmm2 
121- ; ZN3-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 
122- ; ZN3-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
123- ; ZN3-NEXT:    vandps %xmm4, %xmm0, %xmm0 
124- ; ZN3-NEXT:    vmulps %xmm1, %xmm2, %xmm1 
125- ; ZN3-NEXT:    vmulps %xmm3, %xmm1, %xmm1 
126- ; ZN3-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] 
127- ; ZN3-NEXT:    vcmpleps %xmm0, %xmm3, %xmm0 
128- ; ZN3-NEXT:    vandps %xmm1, %xmm0, %xmm0 
129- ; ZN3-NEXT:    retq 
81+ ; FAST-VECTOR-LABEL: v4f32_no_daz: 
82+ ; FAST-VECTOR:       # %bb.0: 
83+ ; FAST-VECTOR-NEXT:    vsqrtps %xmm0, %xmm0 
84+ ; FAST-VECTOR-NEXT:    retq 
13085  %call  = tail  call  fast <4  x float > @llvm.sqrt.v4f32 (<4  x float > %f ) #2 
13186  ret  <4  x float > %call 
13287}
@@ -194,42 +149,10 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
194149; BDW-NEXT:    vandps %ymm1, %ymm0, %ymm0 
195150; BDW-NEXT:    retq 
196151; 
197- ; SKL-LABEL: v8f32_no_daz: 
198- ; SKL:       # %bb.0: 
199- ; SKL-NEXT:    vsqrtps %ymm0, %ymm0 
200- ; SKL-NEXT:    retq 
201- ; 
202- ; ZN1-LABEL: v8f32_no_daz: 
203- ; ZN1:       # %bb.0: 
204- ; ZN1-NEXT:    vrsqrtps %ymm0, %ymm1 
205- ; ZN1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
206- ; ZN1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] 
207- ; ZN1-NEXT:    vmulps %ymm1, %ymm0, %ymm2 
208- ; ZN1-NEXT:    vandps %ymm4, %ymm0, %ymm0 
209- ; ZN1-NEXT:    vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 
210- ; ZN1-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
211- ; ZN1-NEXT:    vmulps %ymm1, %ymm2, %ymm1 
212- ; ZN1-NEXT:    vmulps %ymm3, %ymm1, %ymm1 
213- ; ZN1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] 
214- ; ZN1-NEXT:    vcmpleps %ymm0, %ymm3, %ymm0 
215- ; ZN1-NEXT:    vandps %ymm1, %ymm0, %ymm0 
216- ; ZN1-NEXT:    retq 
217- ; 
218- ; ZN3-LABEL: v8f32_no_daz: 
219- ; ZN3:       # %bb.0: 
220- ; ZN3-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
221- ; ZN3-NEXT:    vrsqrtps %ymm0, %ymm1 
222- ; ZN3-NEXT:    vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] 
223- ; ZN3-NEXT:    vmulps %ymm1, %ymm0, %ymm2 
224- ; ZN3-NEXT:    vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 
225- ; ZN3-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
226- ; ZN3-NEXT:    vandps %ymm4, %ymm0, %ymm0 
227- ; ZN3-NEXT:    vmulps %ymm1, %ymm2, %ymm1 
228- ; ZN3-NEXT:    vmulps %ymm3, %ymm1, %ymm1 
229- ; ZN3-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] 
230- ; ZN3-NEXT:    vcmpleps %ymm0, %ymm3, %ymm0 
231- ; ZN3-NEXT:    vandps %ymm1, %ymm0, %ymm0 
232- ; ZN3-NEXT:    retq 
152+ ; FAST-VECTOR-LABEL: v8f32_no_daz: 
153+ ; FAST-VECTOR:       # %bb.0: 
154+ ; FAST-VECTOR-NEXT:    vsqrtps %ymm0, %ymm0 
155+ ; FAST-VECTOR-NEXT:    retq 
233156  %call  = tail  call  fast <8  x float > @llvm.sqrt.v8f32 (<8  x float > %f ) #2 
234157  ret  <8  x float > %call 
235158}
@@ -256,18 +179,6 @@ define float @f32_daz(float %f) #1 {
256179; FAST-SCALAR:       # %bb.0: 
257180; FAST-SCALAR-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 
258181; FAST-SCALAR-NEXT:    retq 
259- ; 
260- ; SLOW-SCALAR-LABEL: f32_daz: 
261- ; SLOW-SCALAR:       # %bb.0: 
262- ; SLOW-SCALAR-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1 
263- ; SLOW-SCALAR-NEXT:    vmulss %xmm1, %xmm0, %xmm2 
264- ; SLOW-SCALAR-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem 
265- ; SLOW-SCALAR-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 
266- ; SLOW-SCALAR-NEXT:    vmulss %xmm1, %xmm2, %xmm1 
267- ; SLOW-SCALAR-NEXT:    vxorps %xmm2, %xmm2, %xmm2 
268- ; SLOW-SCALAR-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0 
269- ; SLOW-SCALAR-NEXT:    vandnps %xmm1, %xmm0, %xmm0 
270- ; SLOW-SCALAR-NEXT:    retq 
271182  %call  = tail  call  fast float  @llvm.sqrt.f32 (float  %f ) #2 
272183  ret  float  %call 
273184}
@@ -315,38 +226,10 @@ define <4 x float> @v4f32_daz(<4 x float> %f) #1 {
315226; BDW-NEXT:    vandps %xmm1, %xmm0, %xmm0 
316227; BDW-NEXT:    retq 
317228; 
318- ; SKL-LABEL: v4f32_daz: 
319- ; SKL:       # %bb.0: 
320- ; SKL-NEXT:    vsqrtps %xmm0, %xmm0 
321- ; SKL-NEXT:    retq 
322- ; 
323- ; ZN1-LABEL: v4f32_daz: 
324- ; ZN1:       # %bb.0: 
325- ; ZN1-NEXT:    vrsqrtps %xmm0, %xmm1 
326- ; ZN1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
327- ; ZN1-NEXT:    vmulps %xmm1, %xmm0, %xmm2 
328- ; ZN1-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 
329- ; ZN1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
330- ; ZN1-NEXT:    vmulps %xmm1, %xmm2, %xmm1 
331- ; ZN1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 
332- ; ZN1-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0 
333- ; ZN1-NEXT:    vmulps %xmm3, %xmm1, %xmm1 
334- ; ZN1-NEXT:    vandps %xmm1, %xmm0, %xmm0 
335- ; ZN1-NEXT:    retq 
336- ; 
337- ; ZN3-LABEL: v4f32_daz: 
338- ; ZN3:       # %bb.0: 
339- ; ZN3-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
340- ; ZN3-NEXT:    vrsqrtps %xmm0, %xmm1 
341- ; ZN3-NEXT:    vmulps %xmm1, %xmm0, %xmm2 
342- ; ZN3-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 
343- ; ZN3-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
344- ; ZN3-NEXT:    vmulps %xmm1, %xmm2, %xmm1 
345- ; ZN3-NEXT:    vxorps %xmm2, %xmm2, %xmm2 
346- ; ZN3-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0 
347- ; ZN3-NEXT:    vmulps %xmm3, %xmm1, %xmm1 
348- ; ZN3-NEXT:    vandps %xmm1, %xmm0, %xmm0 
349- ; ZN3-NEXT:    retq 
229+ ; FAST-VECTOR-LABEL: v4f32_daz: 
230+ ; FAST-VECTOR:       # %bb.0: 
231+ ; FAST-VECTOR-NEXT:    vsqrtps %xmm0, %xmm0 
232+ ; FAST-VECTOR-NEXT:    retq 
350233  %call  = tail  call  fast <4  x float > @llvm.sqrt.v4f32 (<4  x float > %f ) #2 
351234  ret  <4  x float > %call 
352235}
@@ -405,38 +288,10 @@ define <8 x float> @v8f32_daz(<8 x float> %f) #1 {
405288; BDW-NEXT:    vandps %ymm1, %ymm0, %ymm0 
406289; BDW-NEXT:    retq 
407290; 
408- ; SKL-LABEL: v8f32_daz: 
409- ; SKL:       # %bb.0: 
410- ; SKL-NEXT:    vsqrtps %ymm0, %ymm0 
411- ; SKL-NEXT:    retq 
412- ; 
413- ; ZN1-LABEL: v8f32_daz: 
414- ; ZN1:       # %bb.0: 
415- ; ZN1-NEXT:    vrsqrtps %ymm0, %ymm1 
416- ; ZN1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
417- ; ZN1-NEXT:    vmulps %ymm1, %ymm0, %ymm2 
418- ; ZN1-NEXT:    vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 
419- ; ZN1-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
420- ; ZN1-NEXT:    vmulps %ymm1, %ymm2, %ymm1 
421- ; ZN1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 
422- ; ZN1-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0 
423- ; ZN1-NEXT:    vmulps %ymm3, %ymm1, %ymm1 
424- ; ZN1-NEXT:    vandps %ymm1, %ymm0, %ymm0 
425- ; ZN1-NEXT:    retq 
426- ; 
427- ; ZN3-LABEL: v8f32_daz: 
428- ; ZN3:       # %bb.0: 
429- ; ZN3-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 
430- ; ZN3-NEXT:    vrsqrtps %ymm0, %ymm1 
431- ; ZN3-NEXT:    vmulps %ymm1, %ymm0, %ymm2 
432- ; ZN3-NEXT:    vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 
433- ; ZN3-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 
434- ; ZN3-NEXT:    vmulps %ymm1, %ymm2, %ymm1 
435- ; ZN3-NEXT:    vxorps %xmm2, %xmm2, %xmm2 
436- ; ZN3-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0 
437- ; ZN3-NEXT:    vmulps %ymm3, %ymm1, %ymm1 
438- ; ZN3-NEXT:    vandps %ymm1, %ymm0, %ymm0 
439- ; ZN3-NEXT:    retq 
291+ ; FAST-VECTOR-LABEL: v8f32_daz: 
292+ ; FAST-VECTOR:       # %bb.0: 
293+ ; FAST-VECTOR-NEXT:    vsqrtps %ymm0, %ymm0 
294+ ; FAST-VECTOR-NEXT:    retq 
440295  %call  = tail  call  fast <8  x float > @llvm.sqrt.v8f32 (<8  x float > %f ) #2 
441296  ret  <8  x float > %call 
442297}
0 commit comments