@@ -8013,7 +8013,7 @@ pub unsafe fn _mm_maskz_dbsad_epu8<const IMM8: i32>(
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movepi16_mask&expand=3873)
#[inline]
#[target_feature(enable = "avx512bw")]
- #[cfg_attr(test, assert_instr(mov))] // should be vpmovw2m but msvc does not generate it
+ #[cfg_attr(test, assert_instr(vpmovw2m))]
pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 {
    let filter = _mm512_set1_epi16(1 << 15);
    let a = _mm512_and_si512(a, filter);
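// A minimal usage sketch of the intrinsic above (not part of the diff), assuming
// core::arch::x86_64::* is in scope; `movepi16_mask_demo` is a hypothetical helper.
// Each bit of the returned __mmask32 is the sign bit of one 16-bit lane, which is
// why the body keeps only bit 15 of each lane before building the mask.
#[target_feature(enable = "avx512bw")]
unsafe fn movepi16_mask_demo() {
    let a = _mm512_set1_epi16(-1); // every lane negative -> every mask bit set
    let m: __mmask32 = _mm512_movepi16_mask(a);
    assert_eq!(m, u32::MAX);
}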
@@ -8025,7 +8025,7 @@ pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movepi16_mask&expand=3872)
#[inline]
#[target_feature(enable = "avx512bw,avx512vl")]
- #[cfg_attr(test, assert_instr(mov))] // should be vpmovw2m but msvc does not generate it
+ #[cfg_attr(test, assert_instr(vpmovw2m))]
pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 {
    let filter = _mm256_set1_epi16(1 << 15);
    let a = _mm256_and_si256(a, filter);
@@ -8037,7 +8037,7 @@ pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi16_mask&expand=3871)
#[inline]
#[target_feature(enable = "avx512bw,avx512vl")]
- #[cfg_attr(test, assert_instr(mov))] // should be vpmovw2m but msvc does not generate it
+ #[cfg_attr(test, assert_instr(vpmovw2m))]
pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 {
    let filter = _mm_set1_epi16(1 << 15);
    let a = _mm_and_si128(a, filter);
@@ -8049,7 +8049,7 @@ pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movepi8_mask&expand=3883)
#[inline]
#[target_feature(enable = "avx512bw")]
- #[cfg_attr(test, assert_instr(mov))] // should be vpmovb2m but msvc does not generate it
+ #[cfg_attr(test, assert_instr(vpmovb2m))]
pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 {
    let filter = _mm512_set1_epi8(1 << 7);
    let a = _mm512_and_si512(a, filter);
@@ -8061,7 +8061,8 @@ pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movepi8_mask&expand=3882)
#[inline]
#[target_feature(enable = "avx512bw,avx512vl")]
- #[cfg_attr(test, assert_instr(mov))] // should be vpmovb2m but msvc does not generate it
+ #[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes fewer cycles than
+ // using vpmovb2m plus converting the mask register to a standard register.
pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 {
    let filter = _mm256_set1_epi8(1 << 7);
    let a = _mm256_and_si256(a, filter);
@@ -8073,7 +8074,8 @@ pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi8_mask&expand=3881)
#[inline]
#[target_feature(enable = "avx512bw,avx512vl")]
- #[cfg_attr(test, assert_instr(mov))] // should be vpmovb2m but msvc does not generate it
+ #[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes fewer cycles than
+ // using vpmovb2m plus converting the mask register to a standard register.
pub unsafe fn _mm_movepi8_mask(a: __m128i) -> __mmask16 {
    let filter = _mm_set1_epi8(1 << 7);
    let a = _mm_and_si128(a, filter);
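// A sketch of why the narrower shims land on vpmovmskb (not part of the diff),
// assuming core::arch::x86_64::* is in scope; `movemask_equivalent` is a
// hypothetical helper. For 128-bit (and 256-bit) inputs the same sign-bit mask
// is reachable through the classic SSE2/AVX2 movemask, which writes straight to
// a general-purpose register, whereas vpmovb2m writes a k-register that still
// needs a kmov to reach one.
#[target_feature(enable = "sse2")]
unsafe fn movemask_equivalent(a: __m128i) -> __mmask16 {
    // _mm_movemask_epi8 returns the 16 sign bits in the low bits of an i32.
    _mm_movemask_epi8(a) as __mmask16
}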
@@ -8216,8 +8218,9 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kadd_mask32&expand=3207)
#[inline]
#[target_feature(enable = "avx512bw")]
- #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kaddd
- //llvm.x86.avx512.kadd.d
+ #[cfg_attr(all(test, target_arch = "x86"), assert_instr(add))]
+ #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(lea))] // generate normal lea/add code instead of kaddd
+ //llvm.x86.avx512.kadd.d
pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
    transmute(a + b)
}
@@ -8227,7 +8230,9 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kadd_mask64&expand=3208)
#[inline]
#[target_feature(enable = "avx512bw")]
- #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kaddq
+ #[cfg_attr(all(test, target_arch = "x86"), assert_instr(add))]
+ #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(lea))] // generate normal lea/add code instead of kaddq
+ //llvm.x86.avx512.kadd.q
pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
    transmute(a + b)
}
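// A minimal sketch of the kadd semantics (not part of the diff), assuming
// core::arch::x86_64::* is in scope; `kadd_demo` is a hypothetical helper.
// kadd is plain integer addition of the two mask values, so once the masks sit
// in general-purpose registers LLVM lowers `a + b` to ordinary lea/add, as the
// assert_instr comments above note; kaddd/kaddq only pays off when the
// operands already live in k-registers.
#[target_feature(enable = "avx512bw")]
unsafe fn kadd_demo() {
    let a: __mmask32 = 0b0011;
    let b: __mmask32 = 0b0100;
    assert_eq!(_kadd_mask32(a, b), 0b0111);
}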