@@ -1362,6 +1362,10 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
13621362 case INS_shlx:
13631363 case INS_shrx:
13641364#endif // TARGET_AMD64
1365+ case INS_vcvtsd2usi:
1366+ case INS_vcvtss2usi:
1367+ case INS_vcvttsd2usi:
1368+ case INS_vcvttss2usi:
13651369 {
13661370 if (attr == EA_8BYTE)
13671371 {
@@ -2582,6 +2586,10 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
25822586 case INS_sarx:
25832587 case INS_shrx:
25842588#endif
2589+ case INS_vcvtsd2usi:
2590+ case INS_vcvtss2usi:
2591+ case INS_vcvttsd2usi:
2592+ case INS_vcvttss2usi:
25852593 {
25862594 // These SSE instructions write to a general purpose integer register.
25872595 return false;
@@ -3010,7 +3018,7 @@ inline bool hasTupleTypeInfo(instruction ins)
30103018// Return Value:
30113019// the tuple type info for a given CPU instruction.
30123020//
3013- inline insTupleType insTupleTypeInfo(instruction ins)
3021+ insTupleType emitter:: insTupleTypeInfo(instruction ins) const
30143022{
30153023 assert((unsigned)ins < ArrLen(insTupleTypeInfos));
30163024 assert(insTupleTypeInfos[ins] != INS_TT_NONE);
@@ -3020,9 +3028,9 @@ inline insTupleType insTupleTypeInfo(instruction ins)
30203028// Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
30213029bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
30223030{
3023- const size_t SSE38 = 0x0F660038 ;
3024- const size_t SSE3A = 0x0F66003A ;
3025- const size_t MASK = 0xFFFF00FF ;
3031+ const size_t SSE38 = 0x0F000038 ;
3032+ const size_t SSE3A = 0x0F00003A ;
3033+ const size_t MASK = 0xFF0000FF ;
30263034
30273035 size_t insCode = 0;
30283036
@@ -3044,8 +3052,19 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
30443052 insCode = insCodeMR(ins);
30453053 }
30463054
3047- insCode &= MASK;
3048- return insCode == SSE38 || insCode == SSE3A;
3055+ size_t mskCode = insCode & MASK;
3056+
3057+ if ((mskCode != SSE38) && (mskCode != SSE3A))
3058+ {
3059+ return false;
3060+ }
3061+
3062+ #if defined(DEBUG)
3063+ insCode = (insCode >> 16) & 0xFF;
3064+ assert((insCode == 0x66) || (insCode == 0xF2) || (insCode == 0xF3));
3065+ #endif // DEBUG
3066+
3067+ return true;
30493068}
30503069
30513070/*****************************************************************************
@@ -11214,6 +11233,10 @@ void emitter::emitDispIns(
1121411233 case INS_cvtss2si:
1121511234 case INS_cvtsd2si:
1121611235 case INS_cvttss2si:
11236+ case INS_vcvtsd2usi:
11237+ case INS_vcvtss2usi:
11238+ case INS_vcvttsd2usi:
11239+ case INS_vcvttss2usi:
1121711240 {
1121811241 printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
1121911242 break;
@@ -15528,9 +15551,9 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI
1552815551 disp8Compression = inputSize * 4;
1552915552 break;
1553015553 case INS_TT_TUPLE8:
15531- // N = input size in bytes * 4 , 32bit for 512 only
15554+ // N = input size in bytes * 8 , 32bit for 512 only
1553215555 assert((inputSize == 4 && vectorLength >= 64));
15533- disp8Compression = inputSize * 4 ;
15556+ disp8Compression = inputSize * 8 ;
1553415557 break;
1553515558 case INS_TT_HALF_MEM:
1553615559 // N = vector length in bytes / 2
@@ -17825,11 +17848,39 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1782517848 case INS_cvttps2dq:
1782617849 case INS_cvtps2dq:
1782717850 case INS_cvtdq2ps:
17851+ case INS_vcvtpd2qq:
17852+ case INS_vcvtpd2uqq:
17853+ case INS_vcvtps2udq:
17854+ case INS_vcvtqq2pd:
17855+ case INS_vcvttps2udq:
17856+ case INS_vcvtudq2ps:
17857+ case INS_vcvttpd2qq:
17858+ case INS_vcvttpd2uqq:
17859+ case INS_vcvtuqq2pd:
17860+ result.insThroughput = PERFSCORE_THROUGHPUT_2X;
17861+ result.insLatency += PERFSCORE_LATENCY_4C;
17862+ break;
17863+
17864+ case INS_vpmovdb:
1782817865 case INS_vpmovdw:
17866+ case INS_vpmovqb:
1782917867 case INS_vpmovqd:
17868+ case INS_vpmovqw:
17869+ case INS_vpmovsdb:
17870+ case INS_vpmovsdw:
17871+ case INS_vpmovsqb:
17872+ case INS_vpmovsqd:
17873+ case INS_vpmovsqw:
17874+ case INS_vpmovswb:
17875+ case INS_vpmovusdb:
17876+ case INS_vpmovusdw:
17877+ case INS_vpmovusqb:
17878+ case INS_vpmovusqd:
17879+ case INS_vpmovusqw:
17880+ case INS_vpmovuswb:
1783017881 case INS_vpmovwb:
17831- result.insThroughput = PERFSCORE_THROUGHPUT_2X ;
17832- result.insLatency += PERFSCORE_LATENCY_4C;
17882+ result.insThroughput = PERFSCORE_THROUGHPUT_2C ;
17883+ result.insLatency += (opSize == EA_16BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_4C;
1783317884 break;
1783417885
1783517886 case INS_haddps:
@@ -17892,12 +17943,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1789217943 case INS_cvtsi2ss32:
1789317944 case INS_cvtsi2sd64:
1789417945 case INS_cvtsi2ss64:
17946+ case INS_vcvtsd2usi:
17947+ case INS_vcvttsd2usi:
17948+ case INS_vcvtusi2sd32:
17949+ case INS_vcvtusi2sd64:
17950+ case INS_vcvtusi2ss32:
17951+ case INS_vcvtusi2ss64:
1789517952 result.insThroughput = PERFSCORE_THROUGHPUT_1C;
1789617953 result.insLatency += PERFSCORE_LATENCY_7C;
1789717954 break;
1789817955
1789917956 case INS_cvttss2si:
1790017957 case INS_cvtss2si:
17958+ case INS_vcvtss2usi:
17959+ case INS_vcvttss2usi:
1790117960 result.insThroughput = PERFSCORE_THROUGHPUT_1C;
1790217961 result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C;
1790317962 break;
@@ -18241,6 +18300,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1824118300 case INS_cvtdq2pd:
1824218301 case INS_cvtpd2ps:
1824318302 case INS_cvttpd2dq:
18303+ case INS_vcvtpd2udq:
18304+ case INS_vcvtps2qq:
18305+ case INS_vcvtps2uqq:
18306+ case INS_vcvtqq2ps:
18307+ case INS_vcvttpd2udq:
18308+ case INS_vcvttps2qq:
18309+ case INS_vcvttps2uqq:
18310+ case INS_vcvtudq2pd:
18311+ case INS_vcvtuqq2ps:
1824418312 result.insThroughput = PERFSCORE_THROUGHPUT_1C;
1824518313 result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C;
1824618314 break;
@@ -18282,17 +18350,25 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1828218350 case INS_vpbroadcastq_gpr:
1828318351 case INS_vbroadcasti128:
1828418352 case INS_vbroadcastf128:
18353+ case INS_vbroadcastf64x2:
18354+ case INS_vbroadcasti64x2:
18355+ case INS_vbroadcastf64x4:
18356+ case INS_vbroadcasti64x4:
18357+ case INS_vbroadcastf32x2:
18358+ case INS_vbroadcasti32x2:
18359+ case INS_vbroadcastf32x8:
18360+ case INS_vbroadcasti32x8:
1828518361 case INS_vbroadcastss:
1828618362 case INS_vbroadcastsd:
1828718363 if (memAccessKind == PERFSCORE_MEMORY_NONE)
1828818364 {
1828918365 result.insThroughput = PERFSCORE_THROUGHPUT_1C;
18290- result.insLatency = opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C ;
18366+ result.insLatency = opSize == EA_16BYTE ? PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_3C ;
1829118367 }
1829218368 else
1829318369 {
1829418370 result.insThroughput = PERFSCORE_THROUGHPUT_2X;
18295- result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C ;
18371+ result.insLatency += opSize == EA_16BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C ;
1829618372 if (ins == INS_vpbroadcastb || ins == INS_vpbroadcastw)
1829718373 {
1829818374 result.insLatency += PERFSCORE_LATENCY_1C;
0 commit comments