AVX-512-Arithmetic-XMM

AVX-512-Arithmetic-XMM#

_mm_mask_abs_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a
Param ETypes:: UI8 src, MASK k, SI8 a

__m128i _mm_mask_abs_epi8(__m128i src, __mmask16 k,
                          __m128i a);

Intel Description

Compute the absolute value of packed signed 8-bit integers in “a”, and store the unsigned results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := ABS(a[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_abs_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a
Param ETypes:: MASK k, SI8 a

__m128i _mm_maskz_abs_epi8(__mmask16 k, __m128i a);

Intel Description

Compute the absolute value of packed signed 8-bit integers in “a”, and store the unsigned results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := ABS(a[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_abs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a
Param ETypes:: UI16 src, MASK k, SI16 a

__m128i _mm_mask_abs_epi16(__m128i src, __mmask8 k,
                           __m128i a);

Intel Description

Compute the absolute value of packed signed 16-bit integers in “a”, and store the unsigned results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := ABS(a[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_abs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI16 a

__m128i _mm_maskz_abs_epi16(__mmask8 k, __m128i a);

Intel Description

Compute the absolute value of packed signed 16-bit integers in “a”, and store the unsigned results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := ABS(a[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_add_epi8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b);

Intel Description

Add packed 8-bit integers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := a[i+7:i] + b[i+7:i]
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_add_epi8(__mmask16 k, __m128i a,
                           __m128i b);

Intel Description

Add packed 8-bit integers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := a[i+7:i] + b[i+7:i]
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_adds_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, SI8 a, SI8 b

__m128i _mm_mask_adds_epi8(__m128i src, __mmask16 k,
                           __m128i a, __m128i b);

Intel Description

Add packed signed 8-bit integers in “a” and “b” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_adds_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI8 a, SI8 b

__m128i _mm_maskz_adds_epi8(__mmask16 k, __m128i a,
                            __m128i b);

Intel Description

Add packed signed 8-bit integers in “a” and “b” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_adds_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_adds_epi16(__m128i src, __mmask8 k,
                            __m128i a, __m128i b);

Intel Description

Add packed signed 16-bit integers in “a” and “b” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_adds_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_adds_epi16(__mmask8 k, __m128i a,
                             __m128i b);

Intel Description

Add packed signed 16-bit integers in “a” and “b” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_adds_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_adds_epu8(__m128i src, __mmask16 k,
                           __m128i a, __m128i b);

Intel Description

Add packed unsigned 8-bit integers in “a” and “b” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_adds_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_adds_epu8(__mmask16 k, __m128i a,
                            __m128i b);

Intel Description

Add packed unsigned 8-bit integers in “a” and “b” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_adds_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_adds_epu16(__m128i src, __mmask8 k,
                            __m128i a, __m128i b);

Intel Description

Add packed unsigned 16-bit integers in “a” and “b” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_adds_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_adds_epu16(__mmask8 k, __m128i a,
                             __m128i b);

Intel Description

Add packed unsigned 16-bit integers in “a” and “b” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_add_epi16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b);

Intel Description

Add packed 16-bit integers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := a[i+15:i] + b[i+15:i]
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_add_epi16(__mmask8 k, __m128i a,
                            __m128i b);

Intel Description

Add packed 16-bit integers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := a[i+15:i] + b[i+15:i]
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_avg_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_avg_epu8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b);

Intel Description

Average packed unsigned 8-bit integers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_avg_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_avg_epu8(__mmask16 k, __m128i a,
                           __m128i b);

Intel Description

Average packed unsigned 8-bit integers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_avg_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_avg_epu16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b);

Intel Description

Average packed unsigned 16-bit integers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_avg_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_avg_epu16(__mmask8 k, __m128i a,
                            __m128i b);

Intel Description

Average packed unsigned 16-bit integers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_maddubs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI16 src, MASK k, UI8 a, SI8 b

__m128i _mm_mask_maddubs_epi16(__m128i src, __mmask8 k,
                               __m128i a, __m128i b);

Intel Description

Multiply packed unsigned 8-bit integers in “a” by packed signed 8-bit integers in “b”, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_maddubs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, SI8 b

__m128i _mm_maskz_maddubs_epi16(__mmask8 k, __m128i a,
                                __m128i b);

Intel Description

Multiply packed unsigned 8-bit integers in “a” by packed signed 8-bit integers in “b”, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_madd_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI32 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_madd_epi16(__m128i src, __mmask8 k,
                            __m128i a, __m128i b);

Intel Description

Multiply packed signed 16-bit integers in “a” and “b”, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_madd_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_madd_epi16(__mmask8 k, __m128i a,
                             __m128i b);

Intel Description

Multiply packed signed 16-bit integers in “a” and “b”, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, SI8 a, SI8 b

__m128i _mm_mask_max_epi8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b);

Intel Description

Compare packed signed 8-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI8 a, SI8 b

__m128i _mm_maskz_max_epi8(__mmask16 k, __m128i a,
                           __m128i b);

Intel Description

Compare packed signed 8-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_max_epi16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b);

Intel Description

Compare packed signed 16-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_max_epi16(__mmask8 k, __m128i a,
                            __m128i b);

Intel Description

Compare packed signed 16-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_max_epu8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b);

Intel Description

Compare packed unsigned 8-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_max_epu8(__mmask16 k, __m128i a,
                           __m128i b);

Intel Description

Compare packed unsigned 8-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_max_epu16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b);

Intel Description

Compare packed unsigned 16-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_max_epu16(__mmask8 k, __m128i a,
                            __m128i b);

Intel Description

Compare packed unsigned 16-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, SI8 a, SI8 b

__m128i _mm_mask_min_epi8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b);

Intel Description

Compare packed signed 8-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI8 a, SI8 b

__m128i _mm_maskz_min_epi8(__mmask16 k, __m128i a,
                           __m128i b);

Intel Description

Compare packed signed 8-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_min_epi16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b);

Intel Description

Compare packed signed 16-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_min_epi16(__mmask8 k, __m128i a,
                            __m128i b);

Intel Description

Compare packed signed 16-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_min_epu8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b);

Intel Description

Compare packed unsigned 8-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_min_epu8(__mmask16 k, __m128i a,
                           __m128i b);

Intel Description

Compare packed unsigned 8-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_min_epu16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b);

Intel Description

Compare packed unsigned 16-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_min_epu16(__mmask8 k, __m128i a,
                            __m128i b);

Intel Description

Compare packed unsigned 16-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mulhrs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_mulhrs_epi16(__m128i src, __mmask8 k,
                              __m128i a, __m128i b)

Intel Description

Multiply packed signed 16-bit integers in “a” and “b”, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1
                dst[i+15:i] := tmp[16:1]
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mulhrs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_mulhrs_epi16(__mmask8 k, __m128i a,
                               __m128i b)

Intel Description

Multiply packed signed 16-bit integers in “a” and “b”, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1
                dst[i+15:i] := tmp[16:1]
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mulhi_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_mulhi_epu16(__m128i src, __mmask8 k,
                             __m128i a, __m128i b)

Intel Description

Multiply the packed unsigned 16-bit integers in “a” and “b”, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := a[i+15:i] * b[i+15:i]
                dst[i+15:i] := tmp[31:16]
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mulhi_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_mulhi_epu16(__mmask8 k, __m128i a,
                              __m128i b)

Intel Description

Multiply the packed unsigned 16-bit integers in “a” and “b”, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := a[i+15:i] * b[i+15:i]
                dst[i+15:i] := tmp[31:16]
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mulhi_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_mulhi_epi16(__m128i src, __mmask8 k,
                             __m128i a, __m128i b)

Intel Description

Multiply the packed signed 16-bit integers in “a” and “b”, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
                dst[i+15:i] := tmp[31:16]
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mulhi_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_mulhi_epi16(__mmask8 k, __m128i a,
                              __m128i b)

Intel Description

Multiply the packed signed 16-bit integers in “a” and “b”, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
                dst[i+15:i] := tmp[31:16]
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mullo_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_mullo_epi16(__m128i src, __mmask8 k,
                             __m128i a, __m128i b)

Intel Description

Multiply the packed 16-bit integers in “a” and “b”, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
                dst[i+15:i] := tmp[15:0]
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mullo_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_mullo_epi16(__mmask8 k, __m128i a,
                              __m128i b)

Intel Description

Multiply the packed 16-bit integers in “a” and “b”, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
                dst[i+15:i] := tmp[15:0]
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_sub_epi8(__m128i src, __mmask16 k,
                          __m128i a, __m128i b)

Intel Description

Subtract packed 8-bit integers in “b” from packed 8-bit integers in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := a[i+7:i] - b[i+7:i]
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_sub_epi8(__mmask16 k, __m128i a,
                           __m128i b)

Intel Description

Subtract packed 8-bit integers in “b” from packed 8-bit integers in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := a[i+7:i] - b[i+7:i]
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_subs_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, SI8 a, SI8 b

__m128i _mm_mask_subs_epi8(__m128i src, __mmask16 k,
                           __m128i a, __m128i b)

Intel Description

Subtract packed signed 8-bit integers in “b” from packed signed 8-bit integers in “a” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_subs_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI8 a, SI8 b

__m128i _mm_maskz_subs_epi8(__mmask16 k, __m128i a,
                            __m128i b)

Intel Description

Subtract packed signed 8-bit integers in “b” from packed signed 8-bit integers in “a” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_subs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_subs_epi16(__m128i src, __mmask8 k,
                            __m128i a, __m128i b)

Intel Description

Subtract packed signed 16-bit integers in “b” from packed signed 16-bit integers in “a” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_subs_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI16 a, SI16 b

__m128i _mm_maskz_subs_epi16(__mmask8 k, __m128i a,
                             __m128i b)

Intel Description

Subtract packed signed 16-bit integers in “b” from packed signed 16-bit integers in “a” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_subs_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask16 k, __m128i a, __m128i b
Param ETypes:: UI8 src, MASK k, UI8 a, UI8 b

__m128i _mm_mask_subs_epu8(__m128i src, __mmask16 k,
                           __m128i a, __m128i b)

Intel Description

Subtract packed unsigned 8-bit integers in “b” from packed unsigned 8-bit integers in “a” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
        ELSE
                dst[i+7:i] := src[i+7:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_subs_epu8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask16 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI8 a, UI8 b

__m128i _mm_maskz_subs_epu8(__mmask16 k, __m128i a,
                            __m128i b)

Intel Description

Subtract packed unsigned 8-bit integers in “b” from packed unsigned 8-bit integers in “a” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 15
        i := j*8
        IF k[j]
                dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
        ELSE
                dst[i+7:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_subs_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_subs_epu16(__m128i src, __mmask8 k,
                            __m128i a, __m128i b)

Intel Description

Subtract packed unsigned 16-bit integers in “b” from packed unsigned 16-bit integers in “a” using saturation, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_subs_epu16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_subs_epu16(__mmask8 k, __m128i a,
                             __m128i b)

Intel Description

Subtract packed unsigned 16-bit integers in “b” from packed unsigned 16-bit integers in “a” using saturation, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI16 src, MASK k, UI16 a, UI16 b

__m128i _mm_mask_sub_epi16(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Subtract packed 16-bit integers in “b” from packed 16-bit integers in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := a[i+15:i] - b[i+15:i]
        ELSE
                dst[i+15:i] := src[i+15:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI16 a, UI16 b

__m128i _mm_maskz_sub_epi16(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Subtract packed 16-bit integers in “b” from packed 16-bit integers in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        i := j*16
        IF k[j]
                dst[i+15:i] := a[i+15:i] - b[i+15:i]
        ELSE
                dst[i+15:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_reduce_add_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __m128i a
Param ETypes:: SI16 a

short _mm_reduce_add_epi16(__m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by addition. Returns the sum of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_ADD(src, len) {
        IF len == 2
                RETURN src[15:0] + src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_ADD(src[16*len-1:0], len)
}
dst[15:0] := REDUCE_ADD(a, 8)

_mm_mask_reduce_add_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI16 a

short _mm_mask_reduce_add_epi16(__mmask8 k, __m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by addition using mask “k”. Returns the sum of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_ADD(src, len) {
        IF len == 2
                RETURN src[15:0] + src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_ADD(src[16*len-1:0], len)
}
tmp := a
FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[i+15:i] := a[i+15:i]
        ELSE
                tmp[i+15:i] := 0
        FI
ENDFOR
dst[15:0] := REDUCE_ADD(tmp, 8)

_mm_reduce_add_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __m128i a
Param ETypes:: SI8 a

char _mm_reduce_add_epi8(__m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by addition. Returns the sum of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_ADD(src, len) {
        IF len == 2
                RETURN src[7:0] + src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_ADD(src[8*len-1:0], len)
}
dst[7:0] := REDUCE_ADD(a, 16)

_mm_mask_reduce_add_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __mmask16 k, __m128i a
Param ETypes:: MASK k, SI8 a

char _mm_mask_reduce_add_epi8(__mmask16 k, __m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by addition using mask “k”. Returns the sum of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_ADD(src, len) {
        IF len == 2
                RETURN src[7:0] + src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_ADD(src[8*len-1:0], len)
}
tmp := a
FOR j := 0 to 15
        i := j*8
        IF k[j]
                tmp[i+7:i] := a[i+7:i]
        ELSE
                tmp[i+7:i] := 0
        FI
ENDFOR
dst[7:0] := REDUCE_ADD(tmp, 16)

_mm_reduce_mul_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __m128i a
Param ETypes:: SI16 a

short _mm_reduce_mul_epi16(__m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by multiplication. Returns the product of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_MUL(src, len) {
        IF len == 2
                RETURN src[15:0] * src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_MUL(src[16*len-1:0], len)
}
dst[15:0] := REDUCE_MUL(a, 8)

_mm_mask_reduce_mul_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI16 a

short _mm_mask_reduce_mul_epi16(__mmask8 k, __m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by multiplication using mask “k”. Returns the product of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_MUL(src, len) {
        IF len == 2
                RETURN src[15:0] * src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_MUL(src[16*len-1:0], len)
}
tmp := a
FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[i+15:i] := a[i+15:i]
        ELSE
                tmp[i+15:i] := 1
        FI
ENDFOR
dst[15:0] := REDUCE_MUL(tmp, 8)

_mm_reduce_mul_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __m128i a
Param ETypes:: SI8 a

char _mm_reduce_mul_epi8(__m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by multiplication. Returns the product of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_MUL(src, len) {
        IF len == 2
                RETURN src[7:0] * src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_MUL(src[8*len-1:0], len)
}
dst[7:0] := REDUCE_MUL(a, 16)

_mm_mask_reduce_mul_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __mmask16 k, __m128i a
Param ETypes:: MASK k, SI8 a

char _mm_mask_reduce_mul_epi8(__mmask16 k, __m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by multiplication using mask “k”. Returns the product of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_MUL(src, len) {
        IF len == 2
                RETURN src[7:0] * src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_MUL(src[8*len-1:0], len)
}
tmp := a
FOR j := 0 to 15
        i := j*8
        IF k[j]
                tmp[i+7:i] := a[i+7:i]
        ELSE
                tmp[i+7:i] := 1
        FI
ENDFOR
dst[7:0] := REDUCE_MUL(tmp, 16)

_mm_reduce_or_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __m128i a
Param ETypes:: SI16 a

short _mm_reduce_or_epi16(__m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by bitwise OR. Returns the bitwise OR of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_OR(src, len) {
        IF len == 2
                RETURN src[15:0] OR src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_OR(src[16*len-1:0], len)
}
dst[15:0] := REDUCE_OR(a, 8)

_mm_mask_reduce_or_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI16 a

short _mm_mask_reduce_or_epi16(__mmask8 k, __m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by bitwise OR using mask “k”. Returns the bitwise OR of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_OR(src, len) {
        IF len == 2
                RETURN src[15:0] OR src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_OR(src[16*len-1:0], len)
}
tmp := a
FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[i+15:i] := a[i+15:i]
        ELSE
                tmp[i+15:i] := 0
        FI
ENDFOR
dst[15:0] := REDUCE_OR(tmp, 8)

_mm_reduce_or_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __m128i a
Param ETypes:: SI8 a

char _mm_reduce_or_epi8(__m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by bitwise OR. Returns the bitwise OR of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_OR(src, len) {
        IF len == 2
                RETURN src[7:0] OR src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_OR(src[8*len-1:0], len)
}
dst[7:0] := REDUCE_OR(a, 16)

_mm_mask_reduce_or_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __mmask16 k, __m128i a
Param ETypes:: MASK k, SI8 a

char _mm_mask_reduce_or_epi8(__mmask16 k, __m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by bitwise OR using mask “k”. Returns the bitwise OR of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_OR(src, len) {
        IF len == 2
                RETURN src[7:0] OR src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_OR(src[8*len-1:0], len)
}
tmp := a
FOR j := 0 to 15
        i := j*8
        IF k[j]
                tmp[i+7:i] := a[i+7:i]
        ELSE
                tmp[i+7:i] := 0
        FI
ENDFOR
dst[7:0] := REDUCE_OR(tmp, 16)

_mm_reduce_and_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __m128i a
Param ETypes:: SI16 a

short _mm_reduce_and_epi16(__m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by bitwise AND. Returns the bitwise AND of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_AND(src, len) {
        IF len == 2
                RETURN src[15:0] AND src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_AND(src[16*len-1:0], len)
}
dst[15:0] := REDUCE_AND(a, 8)

_mm_mask_reduce_and_epi16#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: short
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI16 a

short _mm_mask_reduce_and_epi16(__mmask8 k, __m128i a);

Intel Description

Reduce the packed 16-bit integers in “a” by bitwise AND using mask “k”. Returns the bitwise AND of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_AND(src, len) {
        IF len == 2
                RETURN src[15:0] AND src[31:16]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*16
                src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len]
        ENDFOR
        RETURN REDUCE_AND(src[16*len-1:0], len)
}
tmp := a
FOR j := 0 to 7
        i := j*16
        IF k[j]
                tmp[i+15:i] := a[i+15:i]
        ELSE
                tmp[i+15:i] := 0xFFFF
        FI
ENDFOR
dst[15:0] := REDUCE_AND(tmp, 8)

_mm_reduce_and_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __m128i a
Param ETypes:: SI8 a

char _mm_reduce_and_epi8(__m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by bitwise AND. Returns the bitwise AND of all elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_AND(src, len) {
        IF len == 2
                RETURN src[7:0] AND src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_AND(src[8*len-1:0], len)
}
dst[7:0] := REDUCE_AND(a, 16)

_mm_mask_reduce_and_epi8#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: char
Param Types:: __mmask16 k, __m128i a
Param ETypes:: MASK k, SI8 a

char _mm_mask_reduce_and_epi8(__mmask16 k, __m128i a);

Intel Description

Reduce the packed 8-bit integers in “a” by bitwise AND using mask “k”. Returns the bitwise AND of all active elements in “a”.

Intel Implementation Psudeo-Code

DEFINE REDUCE_AND(src, len) {
        IF len == 2
                RETURN src[7:0] AND src[15:8]
        FI
        len := len / 2
        FOR j:= 0 to (len-1)
                i := j*8
                src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len]
        ENDFOR
        RETURN REDUCE_AND(src[8*len-1:0], len)
}
tmp := a
FOR j := 0 to 15
        i := j*8
        IF k[j]
                tmp[i+7:i] := a[i+7:i]
        ELSE
                tmp[i+7:i] := 0xFF
        FI
ENDFOR
dst[7:0] := REDUCE_AND(tmp, 16)

_mm_mask_mullo_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, UI64 a, UI64 b

__m128i _mm_mask_mullo_epi64(__m128i src, __mmask8 k,
                             __m128i a, __m128i b)

Intel Description

Multiply the packed 64-bit integers in “a” and “b”, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                tmp[127:0] := a[i+63:i] * b[i+63:i]
                dst[i+63:i] := tmp[63:0]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mullo_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI64 a, UI64 b

__m128i _mm_maskz_mullo_epi64(__mmask8 k, __m128i a,
                              __m128i b)

Intel Description

Multiply the packed 64-bit integers in “a” and “b”, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                tmp[127:0] := a[i+63:i] * b[i+63:i]
                dst[i+63:i] := tmp[63:0]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mullo_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b
Param ETypes:: UI64 a, UI64 b

__m128i _mm_mullo_epi64(__m128i a, __m128i b);

Intel Description

Multiply the packed 64-bit integers in “a” and “b”, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        tmp[127:0] := a[i+63:i] * b[i+63:i]
        dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_add_pd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Add packed double-precision (64-bit) floating-point elements in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] + b[i+63:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_add_pd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Add packed double-precision (64-bit) floating-point elements in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] + b[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_add_ps(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Add packed single-precision (32-bit) floating-point elements in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] + b[i+31:i]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_add_ps(__mmask8 k, __m128 a, __m128 b);

Intel Description

Add packed single-precision (32-bit) floating-point elements in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] + b[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_div_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_div_pd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Divide packed double-precision (64-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := 64*j
        IF k[j]
                dst[i+63:i] := a[i+63:i] / b[i+63:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_div_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_div_pd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Divide packed double-precision (64-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := 64*j
        IF k[j]
                dst[i+63:i] := a[i+63:i] / b[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_div_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_div_ps(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Divide packed single-precision (32-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := 32*j
        IF k[j]
                dst[i+31:i] := a[i+31:i] / b[i+31:i]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_div_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_div_ps(__mmask8 k, __m128 a, __m128 b);

Intel Description

Divide packed single-precision (32-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := 32*j
        IF k[j]
                dst[i+31:i] := a[i+31:i] / b[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fmadd_pd(__m128d a, __m128d b, __m128d c,
                           __mmask8 k)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
                dst[i+63:i] := c[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fmadd_pd(__m128d a, __mmask8 k, __m128d b,
                          __m128d c)

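Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).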

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fmadd_pd(__mmask8 k, __m128d a, __m128d b,
                           __m128d c)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fmadd_ps(__m128 a, __m128 b, __m128 c,
                          __mmask8 k)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
                dst[i+31:i] := c[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fmadd_ps(__m128 a, __mmask8 k, __m128 b,
                         __m128 c)

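Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).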

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
                dst[i+31:i] := a[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fmadd_ps(__mmask8 k, __m128 a, __m128 b,
                          __m128 c)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmaddsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fmaddsub_pd(__m128d a, __m128d b,
                              __m128d c, __mmask8 k)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
                ELSE
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
                FI
        ELSE
                dst[i+63:i] := c[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmaddsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fmaddsub_pd(__m128d a, __mmask8 k,
                             __m128d b, __m128d c)

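Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).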

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
                ELSE
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
                FI
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmaddsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fmaddsub_pd(__mmask8 k, __m128d a,
                              __m128d b, __m128d c)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
                ELSE
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
                FI
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmaddsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fmaddsub_ps(__m128 a, __m128 b, __m128 c,
                             __mmask8 k)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
                ELSE
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
                FI
        ELSE
                dst[i+31:i] := c[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmaddsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fmaddsub_ps(__m128 a, __mmask8 k, __m128 b,
                            __m128 c)

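Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).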

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
                ELSE
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
                FI
        ELSE
                dst[i+31:i] := a[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmaddsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fmaddsub_ps(__mmask8 k, __m128 a, __m128 b,
                             __m128 c)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
                ELSE
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
                FI
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fmsub_pd(__m128d a, __m128d b, __m128d c,
                           __mmask8 k)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
                dst[i+63:i] := c[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fmsub_pd(__m128d a, __mmask8 k, __m128d b,
                          __m128d c)

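Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).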

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fmsub_pd(__mmask8 k, __m128d a, __m128d b,
                           __m128d c)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fmsub_ps(__m128 a, __m128 b, __m128 c,
                          __mmask8 k)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
                dst[i+31:i] := c[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fmsub_ps(__m128 a, __mmask8 k, __m128 b,
                         __m128 c)

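Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).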

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
                dst[i+31:i] := a[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fmsub_ps(__mmask8 k, __m128 a, __m128 b,
                          __m128 c)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmsubadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fmsubadd_pd(__m128d a, __m128d b,
                              __m128d c, __mmask8 k)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” from/to the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
                ELSE
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
                FI
        ELSE
                dst[i+63:i] := c[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmsubadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fmsubadd_pd(__m128d a, __mmask8 k,
                             __m128d b, __m128d c)

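Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” from/to the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).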

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
                ELSE
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
                FI
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmsubadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fmsubadd_pd(__mmask8 k, __m128d a,
                              __m128d b, __m128d c)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” from/to the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
                ELSE
                        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
                FI
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmsubadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fmsubadd_ps(__m128 a, __m128 b, __m128 c,
                             __mmask8 k)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” from/to the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
                ELSE
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
                FI
        ELSE
                dst[i+31:i] := c[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmsubadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fmsubadd_ps(__m128 a, __mmask8 k, __m128 b,
                            __m128 c)

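Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” from/to the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).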

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
                ELSE
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
                FI
        ELSE
                dst[i+31:i] := a[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmsubadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fmsubadd_ps(__mmask8 k, __m128 a, __m128 b,
                             __m128 c)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” from/to the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                IF ((j & 1) == 0)
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
                ELSE
                        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
                FI
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fnmadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fnmadd_pd(__m128d a, __m128d b, __m128d c,
                            __mmask8 k)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
                dst[i+63:i] := c[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fnmadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fnmadd_pd(__m128d a, __mmask8 k, __m128d b,
                           __m128d c)

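Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).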

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fnmadd_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fnmadd_pd(__mmask8 k, __m128d a,
                            __m128d b, __m128d c)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fnmadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fnmadd_ps(__m128 a, __m128 b, __m128 c,
                           __mmask8 k)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
                dst[i+31:i] := c[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fnmadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fnmadd_ps(__m128 a, __mmask8 k, __m128 b,
                          __m128 c)

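Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).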

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
                dst[i+31:i] := a[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fnmadd_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fnmadd_ps(__mmask8 k, __m128 a, __m128 b,
                           __m128 c)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fnmsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fnmsub_pd(__m128d a, __m128d b, __m128d c,
                            __mmask8 k)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
                dst[i+63:i] := c[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fnmsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fnmsub_pd(__m128d a, __mmask8 k, __m128d b,
                           __m128d c)

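Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).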

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fnmsub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fnmsub_pd(__mmask8 k, __m128d a,
                            __m128d b, __m128d c)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fnmsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fnmsub_ps(__m128 a, __m128 b, __m128 c,
                           __mmask8 k)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
                dst[i+31:i] := c[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fnmsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fnmsub_ps(__m128 a, __mmask8 k, __m128 b,
                          __m128 c)

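Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).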

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
                dst[i+31:i] := a[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fnmsub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fnmsub_ps(__mmask8 k, __m128 a, __m128 b,
                           __m128 c)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_max_pd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

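Compare packed double-precision (64-bit) floating-point elements in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.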

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_max_pd(__mmask8 k, __m128d a, __m128d b);

Intel Description

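Compare packed double-precision (64-bit) floating-point elements in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.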

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_max_ps(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

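Compare packed single-precision (32-bit) floating-point elements in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.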

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_max_ps(__mmask8 k, __m128 a, __m128 b);

Intel Description

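Compare packed single-precision (32-bit) floating-point elements in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.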

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_min_pd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

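Compare packed double-precision (64-bit) floating-point elements in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.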

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_min_pd(__mmask8 k, __m128d a, __m128d b);

Intel Description

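Compare packed double-precision (64-bit) floating-point elements in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.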

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_min_ps(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

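Compare packed single-precision (32-bit) floating-point elements in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.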

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_min_ps(__mmask8 k, __m128 a, __m128 b);

Intel Description

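Compare packed single-precision (32-bit) floating-point elements in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). “dst” does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.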

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mul_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_mul_pd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] * b[i+63:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mul_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_mul_pd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Multiply packed double-precision (64-bit) floating-point elements in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] * b[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mul_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_mul_ps(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] * b[i+31:i]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mul_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_mul_ps(__mmask8 k, __m128 a, __m128 b);

Intel Description

Multiply packed single-precision (32-bit) floating-point elements in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] * b[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_abs_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a
Param ETypes:: UI32 src, MASK k, SI32 a

__m128i _mm_mask_abs_epi32(__m128i src, __mmask8 k,
                           __m128i a)

Intel Description

Compute the absolute value of packed signed 32-bit integers in “a”, and store the unsigned results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := ABS(a[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_abs_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI32 a

__m128i _mm_maskz_abs_epi32(__mmask8 k, __m128i a);

Intel Description

Compute the absolute value of packed signed 32-bit integers in “a”, and store the unsigned results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := ABS(a[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_abs_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a
Param ETypes:: SI64 a

__m128i _mm_abs_epi64(__m128i a);

Intel Description

Compute the absolute value of packed signed 64-bit integers in “a”, and store the unsigned results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

_mm_mask_abs_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a
Param ETypes:: UI64 src, MASK k, SI64 a

__m128i _mm_mask_abs_epi64(__m128i src, __mmask8 k,
                           __m128i a)

Intel Description

Compute the absolute value of packed signed 64-bit integers in “a”, and store the unsigned results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := ABS(a[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_abs_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a
Param ETypes:: MASK k, SI64 a

__m128i _mm_maskz_abs_epi64(__mmask8 k, __m128i a);

Intel Description

Compute the absolute value of packed signed 64-bit integers in “a”, and store the unsigned results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := ABS(a[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, UI32 a, UI32 b

__m128i _mm_mask_add_epi32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Add packed 32-bit integers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] + b[i+31:i]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI32 a, UI32 b

__m128i _mm_maskz_add_epi32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Add packed 32-bit integers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] + b[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, UI64 a, UI64 b

__m128i _mm_mask_add_epi64(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Add packed 64-bit integers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] + b[i+63:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI64 a, UI64 b

__m128i _mm_maskz_add_epi64(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Add packed 64-bit integers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] + b[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, SI32 a, SI32 b

__m128i _mm_mask_max_epi32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed signed 32-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 a, SI32 b

__m128i _mm_maskz_max_epi32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed signed 32-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, SI64 a, SI64 b

__m128i _mm_mask_max_epi64(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed signed 64-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI64 a, SI64 b

__m128i _mm_maskz_max_epi64(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed signed 64-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_max_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b
Param ETypes:: SI64 a, SI64 b

__m128i _mm_max_epi64(__m128i a, __m128i b);

Intel Description

Compare packed signed 64-bit integers in “a” and “b”, and store packed maximum values in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epu32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, UI32 a, UI32 b

__m128i _mm_mask_max_epu32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed unsigned 32-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epu32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI32 a, UI32 b

__m128i _mm_maskz_max_epu32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed unsigned 32-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_max_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, UI64 a, UI64 b

__m128i _mm_mask_max_epu64(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed unsigned 64-bit integers in “a” and “b”, and store packed maximum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_max_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI64 a, UI64 b

__m128i _mm_maskz_max_epu64(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed unsigned 64-bit integers in “a” and “b”, and store packed maximum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_max_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b
Param ETypes:: UI64 a, UI64 b

__m128i _mm_max_epu64(__m128i a, __m128i b);

Intel Description

Compare packed unsigned 64-bit integers in “a” and “b”, and store packed maximum values in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, SI32 a, SI32 b

__m128i _mm_mask_min_epi32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed signed 32-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 a, SI32 b

__m128i _mm_maskz_min_epi32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed signed 32-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, SI64 a, SI64 b

__m128i _mm_mask_min_epi64(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed signed 64-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI64 a, SI64 b

__m128i _mm_maskz_min_epi64(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed signed 64-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_min_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b
Param ETypes:: SI64 a, SI64 b

__m128i _mm_min_epi64(__m128i a, __m128i b);

Intel Description

Compare packed signed 64-bit integers in “a” and “b”, and store packed minimum values in “dst”.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epu32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, UI32 a, UI32 b

__m128i _mm_mask_min_epu32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed unsigned 32-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epu32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI32 a, UI32 b

__m128i _mm_maskz_min_epu32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed unsigned 32-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_min_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, UI64 a, UI64 b

__m128i _mm_mask_min_epu64(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Compare packed unsigned 64-bit integers in “a” and “b”, and store packed minimum values in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_min_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI64 a, UI64 b

__m128i _mm_maskz_min_epu64(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Compare packed unsigned 64-bit integers in “a” and “b”, and store packed minimum values in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_min_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b
Param ETypes:: UI64 a, UI64 b

__m128i _mm_min_epu64(__m128i a, __m128i b);

Intel Description

Compare packed unsigned 64-bit integers in “a” and “b”, and store packed minimum values in “dst”.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

_mm_mask_mul_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI64 src, MASK k, SI32 a, SI32 b

__m128i _mm_mask_mul_epi32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Multiply the low signed 32-bit integers from each packed 64-bit element in “a” and “b”, and store the signed 64-bit results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mul_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 a, SI32 b

__m128i _mm_maskz_mul_epi32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Multiply the low signed 32-bit integers from each packed 64-bit element in “a” and “b”, and store the signed 64-bit results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mullo_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, UI32 a, UI32 b

__m128i _mm_mask_mullo_epi32(__m128i src, __mmask8 k,
                             __m128i a, __m128i b)

Intel Description

Multiply the packed 32-bit integers in “a” and “b”, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                tmp[63:0] := a[i+31:i] * b[i+31:i]
                dst[i+31:i] := tmp[31:0]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mullo_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI32 a, UI32 b

__m128i _mm_maskz_mullo_epi32(__mmask8 k, __m128i a,
                              __m128i b)

Intel Description

Multiply the packed 32-bit integers in “a” and “b”, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                tmp[63:0] := a[i+31:i] * b[i+31:i]
                dst[i+31:i] := tmp[31:0]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mul_epu32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, UI32 a, UI32 b

__m128i _mm_mask_mul_epu32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in “a” and “b”, and store the unsigned 64-bit results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+31:i] * b[i+31:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mul_epu32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI32 a, UI32 b

__m128i _mm_maskz_mul_epu32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Multiply the low unsigned 32-bit integers from each packed 64-bit element in “a” and “b”, and store the unsigned 64-bit results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+31:i] * b[i+31:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI32 src, MASK k, UI32 a, UI32 b

__m128i _mm_mask_sub_epi32(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Subtract packed 32-bit integers in “b” from packed 32-bit integers in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] - b[i+31:i]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI32 a, UI32 b

__m128i _mm_maskz_sub_epi32(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Subtract packed 32-bit integers in “b” from packed 32-bit integers in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] - b[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: UI64 src, MASK k, UI64 a, UI64 b

__m128i _mm_mask_sub_epi64(__m128i src, __mmask8 k,
                           __m128i a, __m128i b)

Intel Description

Subtract packed 64-bit integers in “b” from packed 64-bit integers in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] - b[i+63:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_epi64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b
Param ETypes:: MASK k, UI64 a, UI64 b

__m128i _mm_maskz_sub_epi64(__mmask8 k, __m128i a,
                            __m128i b)

Intel Description

Subtract packed 64-bit integers in “b” from packed 64-bit integers in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] - b[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_rcp14_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a
Param ETypes:: FP64 src, MASK k, FP64 a

__m128d _mm_mask_rcp14_pd(__m128d src, __mmask8 k,
                          __m128d a)

Intel Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (1.0 / a[i+63:i])
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_rcp14_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a
Param ETypes:: MASK k, FP64 a

__m128d _mm_maskz_rcp14_pd(__mmask8 k, __m128d a);

Intel Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (1.0 / a[i+63:i])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_rcp14_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a
Param ETypes:: FP64 a

__m128d _mm_rcp14_pd(__m128d a);

Intel Description

Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst”. The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := (1.0 / a[i+63:i])
ENDFOR
dst[MAX:128] := 0

_mm_mask_rcp14_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a
Param ETypes:: FP32 src, MASK k, FP32 a

__m128 _mm_mask_rcp14_ps(__m128 src, __mmask8 k, __m128 a);

Intel Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (1.0 / a[i+31:i])
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_rcp14_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a
Param ETypes:: MASK k, FP32 a

__m128 _mm_maskz_rcp14_ps(__mmask8 k, __m128 a);

Intel Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (1.0 / a[i+31:i])
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_rcp14_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a
Param ETypes:: FP32 a

__m128 _mm_rcp14_ps(__m128 a);

Intel Description

Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst”. The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        dst[i+31:i] := (1.0 / a[i+31:i])
ENDFOR
dst[MAX:128] := 0

_mm_rsqrt14_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a
Param ETypes:: FP64 a

__m128d _mm_rsqrt14_pd(__m128d a);

Intel Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst”. The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
ENDFOR
dst[MAX:128] := 0

_mm_mask_rsqrt14_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a
Param ETypes:: FP64 src, MASK k, FP64 a

__m128d _mm_mask_rsqrt14_pd(__m128d src, __mmask8 k,
                            __m128d a)

Intel Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_rsqrt14_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a
Param ETypes:: MASK k, FP64 a

__m128d _mm_maskz_rsqrt14_pd(__mmask8 k, __m128d a);

Intel Description

Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_rsqrt14_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a
Param ETypes:: FP32 a

__m128 _mm_rsqrt14_ps(__m128 a);

Intel Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst”. The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:128] := 0

_mm_mask_rsqrt14_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a
Param ETypes:: FP32 src, MASK k, FP32 a

__m128 _mm_mask_rsqrt14_ps(__m128 src, __mmask8 k,
                           __m128 a)

Intel Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_rsqrt14_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a
Param ETypes:: MASK k, FP32 a

__m128 _mm_maskz_rsqrt14_ps(__mmask8 k, __m128 a);

Intel Description

Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_sub_pd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Subtract packed double-precision (64-bit) floating-point elements in “b” from packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] - b[i+63:i]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_pd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_sub_pd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Subtract packed double-precision (64-bit) floating-point elements in “b” from packed double-precision (64-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 1
        i := j*64
        IF k[j]
                dst[i+63:i] := a[i+63:i] - b[i+63:i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_sub_ps(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Subtract packed single-precision (32-bit) floating-point elements in “b” from packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] - b[i+31:i]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_sub_ps(__mmask8 k, __m128 a, __m128 b);

Intel Description

Subtract packed single-precision (32-bit) floating-point elements in “b” from packed single-precision (32-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] - b[i+31:i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_add_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, int rounding
Param ETypes:: FP64 a, FP64 b, IMM rounding

__m128d _mm_add_round_sd(__m128d a, __m128d b,
                         int rounding)

Intel Description

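Add the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these also suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC (see _MM_SET_ROUNDING_MODE).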

Intel Implementation Pseudo-Code

dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_add_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_mask_add_round_sd(__m128d src, __mmask8 k,
                              __m128d a, __m128d b,
                              int rounding)

Intel Description

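Add the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these also suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC (see _MM_SET_ROUNDING_MODE).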

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] + b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_add_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_add_sd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Add the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] + b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_add_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_maskz_add_round_sd(__mmask8 k, __m128d a,
                               __m128d b, int rounding)

Intel Description

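Add the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these also suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC (see _MM_SET_ROUNDING_MODE).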

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] + b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_add_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_add_sd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Add the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] + b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_add_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, int rounding
Param ETypes:: FP32 a, FP32 b, IMM rounding

__m128 _mm_add_round_ss(__m128 a, __m128 b, int rounding);

Intel Description

Add the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_add_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_mask_add_round_ss(__m128 src, __mmask8 k,
                             __m128 a, __m128 b,
                             int rounding)

Intel Description

Add the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] + b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_add_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_add_ss(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Add the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] + b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_add_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_maskz_add_round_ss(__mmask8 k, __m128 a,
                              __m128 b, int rounding)

Intel Description

Add the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] + b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_add_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_add_ss(__mmask8 k, __m128 a, __m128 b);

Intel Description

Add the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] + b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_div_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, int rounding
Param ETypes:: FP64 a, FP64 b, IMM rounding

__m128d _mm_div_round_sd(__m128d a, __m128d b,
                         int rounding)

Intel Description

Divide the lower double-precision (64-bit) floating-point element in “a” by the lower double-precision (64-bit) floating-point element in “b”, store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

dst[63:0] := a[63:0] / b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_div_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_mask_div_round_sd(__m128d src, __mmask8 k,
                              __m128d a, __m128d b,
                              int rounding)

Intel Description

Divide the lower double-precision (64-bit) floating-point element in “a” by the lower double-precision (64-bit) floating-point element in “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] / b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_div_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_div_sd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Divide the lower double-precision (64-bit) floating-point element in “a” by the lower double-precision (64-bit) floating-point element in “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] / b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_div_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_maskz_div_round_sd(__mmask8 k, __m128d a,
                               __m128d b, int rounding)

Intel Description

Divide the lower double-precision (64-bit) floating-point element in “a” by the lower double-precision (64-bit) floating-point element in “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] / b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_div_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_div_sd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Divide the lower double-precision (64-bit) floating-point element in “a” by the lower double-precision (64-bit) floating-point element in “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := a[63:0] / b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_div_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, int rounding
Param ETypes:: FP32 a, FP32 b, IMM rounding

__m128 _mm_div_round_ss(__m128 a, __m128 b, int rounding);

Intel Description

Divide the lower single-precision (32-bit) floating-point element in “a” by the lower single-precision (32-bit) floating-point element in “b”, store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

dst[31:0] := a[31:0] / b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_div_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_mask_div_round_ss(__m128 src, __mmask8 k,
                             __m128 a, __m128 b,
                             int rounding)

Intel Description

Divide the lower single-precision (32-bit) floating-point element in “a” by the lower single-precision (32-bit) floating-point element in “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] / b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_div_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_div_ss(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Divide the lower single-precision (32-bit) floating-point element in “a” by the lower single-precision (32-bit) floating-point element in “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] / b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_div_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_maskz_div_round_ss(__mmask8 k, __m128 a,
                              __m128 b, int rounding)

Intel Description

Divide the lower single-precision (32-bit) floating-point element in “a” by the lower single-precision (32-bit) floating-point element in “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] / b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_div_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_div_ss(__mmask8 k, __m128 a, __m128 b);

Intel Description

Divide the lower single-precision (32-bit) floating-point element in “a” by the lower single-precision (32-bit) floating-point element in “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := a[31:0] / b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_fmadd_round_sd(__m128d a, __m128d b, __m128d c,
                           int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask3_fmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k, IMM rounding

__m128d _mm_mask3_fmadd_round_sd(__m128d a, __m128d b,
                                 __m128d c, __mmask8 k,
                                 int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask3_fmadd_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fmadd_sd(__m128d a, __m128d b, __m128d c,
                           __mmask8 k)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask_fmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c, IMM rounding

__m128d _mm_mask_fmadd_round_sd(__m128d a, __mmask8 k,
                                __m128d b, __m128d c,
                                int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_fmadd_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fmadd_sd(__m128d a, __mmask8 k, __m128d b,
                          __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_maskz_fmadd_round_sd(__mmask8 k, __m128d a,
                                 __m128d b, __m128d c,
                                 int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fmadd_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fmadd_sd(__mmask8 k, __m128d a, __m128d b,
                           __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask3_fmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k, IMM rounding

__m128 _mm_mask3_fmadd_round_ss(__m128 a, __m128 b,
                                __m128 c, __mmask8 k,
                                int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask3_fmadd_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fmadd_ss(__m128 a, __m128 b, __m128 c,
                          __mmask8 k)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_fmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_fmadd_round_ss(__m128 a, __m128 b, __m128 c,
                          int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c, IMM rounding

__m128 _mm_mask_fmadd_round_ss(__m128 a, __mmask8 k,
                               __m128 b, __m128 c,
                               int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmadd_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fmadd_ss(__m128 a, __mmask8 k, __m128 b,
                         __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_maskz_fmadd_round_ss(__mmask8 k, __m128 a,
                                __m128 b, __m128 c,
                                int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fmadd_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fmadd_ss(__mmask8 k, __m128 a, __m128 b,
                          __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_fmsub_round_sd(__m128d a, __m128d b, __m128d c,
                           int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask3_fmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k, IMM rounding

__m128d _mm_mask3_fmsub_round_sd(__m128d a, __m128d b,
                                 __m128d c, __mmask8 k,
                                 int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask3_fmsub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fmsub_sd(__m128d a, __m128d b, __m128d c,
                           __mmask8 k)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask_fmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c, IMM rounding

__m128d _mm_mask_fmsub_round_sd(__m128d a, __mmask8 k,
                                __m128d b, __m128d c,
                                int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_fmsub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fmsub_sd(__m128d a, __mmask8 k, __m128d b,
                          __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_maskz_fmsub_round_sd(__mmask8 k, __m128d a,
                                 __m128d b, __m128d c,
                                 int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is controlled by the “rounding” parameter; see [round_note].

Intel Implementation Pseudo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fmsub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fmsub_sd(__mmask8 k, __m128d a, __m128d b,
                           __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_fmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_fmsub_round_ss(__m128 a, __m128 b, __m128 c,
                          int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k, IMM rounding

__m128 _mm_mask3_fmsub_round_ss(__m128 a, __m128 b,
                                __m128 c, __mmask8 k,
                                int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask3_fmsub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fmsub_ss(__m128 a, __m128 b, __m128 c,
                          __mmask8 k)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask_fmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c, IMM rounding

__m128 _mm_mask_fmsub_round_ss(__m128 a, __mmask8 k,
                               __m128 b, __m128 c,
                               int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmsub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fmsub_ss(__m128 a, __mmask8 k, __m128 b,
                         __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_maskz_fmsub_round_ss(__mmask8 k, __m128 a,
                                __m128 b, __m128 c,
                                int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fmsub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fmsub_ss(__mmask8 k, __m128 a, __m128 b,
                          __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fnmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_fnmadd_round_sd(__m128d a, __m128d b, __m128d c,
                            int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask3_fnmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k, IMM rounding

__m128d _mm_mask3_fnmadd_round_sd(__m128d a, __m128d b,
                                  __m128d c, __mmask8 k,
                                  int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask3_fnmadd_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fnmadd_sd(__m128d a, __m128d b, __m128d c,
                            __mmask8 k)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask_fnmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c, IMM rounding

__m128d _mm_mask_fnmadd_round_sd(__m128d a, __mmask8 k,
                                 __m128d b, __m128d c,
                                 int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_fnmadd_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fnmadd_sd(__m128d a, __mmask8 k, __m128d b,
                           __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fnmadd_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_maskz_fnmadd_round_sd(__mmask8 k, __m128d a,
                                  __m128d b, __m128d c,
                                  int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fnmadd_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fnmadd_sd(__mmask8 k, __m128d a,
                            __m128d b, __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_fnmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_fnmadd_round_ss(__m128 a, __m128 b, __m128 c,
                           int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fnmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k, IMM rounding

__m128 _mm_mask3_fnmadd_round_ss(__m128 a, __m128 b,
                                 __m128 c, __mmask8 k,
                                 int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask3_fnmadd_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fnmadd_ss(__m128 a, __m128 b, __m128 c,
                           __mmask8 k)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask_fnmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c, IMM rounding

__m128 _mm_mask_fnmadd_round_ss(__m128 a, __mmask8 k,
                                __m128 b, __m128 c,
                                int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fnmadd_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fnmadd_ss(__m128 a, __mmask8 k, __m128 b,
                          __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fnmadd_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_maskz_fnmadd_round_ss(__mmask8 k, __m128 a,
                                 __m128 b, __m128 c,
                                 int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fnmadd_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fnmadd_ss(__mmask8 k, __m128 a, __m128 b,
                           __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fnmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_fnmsub_round_sd(__m128d a, __m128d b, __m128d c,
                            int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask3_fnmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k, int rounding
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k, IMM rounding

__m128d _mm_mask3_fnmsub_round_sd(__m128d a, __m128d b,
                                  __m128d c, __mmask8 k,
                                  int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask3_fnmsub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, __m128d c, __mmask8 k
Param ETypes:: FP64 a, FP64 b, FP64 c, MASK k

__m128d _mm_mask3_fnmsub_sd(__m128d a, __m128d b, __m128d c,
                            __mmask8 k)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper element from “c” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := c[63:0]
FI
dst[127:64] := c[127:64]
dst[MAX:128] := 0

_mm_mask_fnmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c, int rounding
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c, IMM rounding

__m128d _mm_mask_fnmsub_round_sd(__m128d a, __mmask8 k,
                                 __m128d b, __m128d c,
                                 int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_fnmsub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __mmask8 k, __m128d b, __m128d c
Param ETypes:: FP64 a, MASK k, FP64 b, FP64 c

__m128d _mm_mask_fnmsub_sd(__m128d a, __mmask8 k, __m128d b,
                           __m128d c)

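Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code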

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fnmsub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c, IMM rounding

__m128d _mm_maskz_fnmsub_round_sd(__mmask8 k, __m128d a,
                                  __m128d b, __m128d c,
                                  int rounding)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_fnmsub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, __m128d c
Param ETypes:: MASK k, FP64 a, FP64 b, FP64 c

__m128d _mm_maskz_fnmsub_sd(__mmask8 k, __m128d a,
                            __m128d b, __m128d c)

Intel Description

Multiply the lower double-precision (64-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_fnmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_fnmsub_round_ss(__m128 a, __m128 b, __m128 c,
                           int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fnmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k, int rounding
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k, IMM rounding

__m128 _mm_mask3_fnmsub_round_ss(__m128 a, __m128 b,
                                 __m128 c, __mmask8 k,
                                 int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask3_fnmsub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, __m128 c, __mmask8 k
Param ETypes:: FP32 a, FP32 b, FP32 c, MASK k

__m128 _mm_mask3_fnmsub_ss(__m128 a, __m128 b, __m128 c,
                           __mmask8 k)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 3 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := c[31:0]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_mask_fnmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c, int rounding
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c, IMM rounding

__m128 _mm_mask_fnmsub_round_ss(__m128 a, __mmask8 k,
                                __m128 b, __m128 c,
                                int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fnmsub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __mmask8 k, __m128 b, __m128 c
Param ETypes:: FP32 a, MASK k, FP32 b, FP32 c

__m128 _mm_mask_fnmsub_ss(__m128 a, __mmask8 k, __m128 b,
                          __m128 c)

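Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code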

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fnmsub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c, IMM rounding

__m128 _mm_maskz_fnmsub_round_ss(__mmask8 k, __m128 a,
                                 __m128 b, __m128 c,
                                 int rounding)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fnmsub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, __m128 c
Param ETypes:: MASK k, FP32 a, FP32 b, FP32 c

__m128 _mm_maskz_fnmsub_ss(__mmask8 k, __m128 a, __m128 b,
                           __m128 c)

Intel Description

Multiply the lower single-precision (32-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_mul_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_mask_mul_round_sd(__m128d src, __mmask8 k,
                              __m128d a, __m128d b,
                              int rounding)

Intel Description

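Multiply the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.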

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] * b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_mul_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_mul_sd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Multiply the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] * b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_mul_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_maskz_mul_round_sd(__mmask8 k, __m128d a,
                               __m128d b, int rounding)

Intel Description

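Multiply the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.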

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] * b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_mul_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_mul_sd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Multiply the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] * b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mul_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, int rounding
Param ETypes:: FP64 a, FP64 b, IMM rounding

__m128d _mm_mul_round_sd(__m128d a, __m128d b,
                         int rounding)

Intel Description

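Multiply the lower double-precision (64-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.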

Intel Implementation Psudeo-Code

dst[63:0] := a[63:0] * b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_mul_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_mask_mul_round_ss(__m128 src, __mmask8 k,
                             __m128 a, __m128 b,
                             int rounding)

Intel Description

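Multiply the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.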

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] * b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_mul_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_mul_ss(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Multiply the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] * b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_mul_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_maskz_mul_round_ss(__mmask8 k, __m128 a,
                              __m128 b, int rounding)

Intel Description

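Multiply the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.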

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] * b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_mul_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_mul_ss(__mmask8 k, __m128 a, __m128 b);

Intel Description

Multiply the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] * b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mul_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, int rounding
Param ETypes:: FP32 a, FP32 b, IMM rounding

__m128 _mm_mul_round_ss(__m128 a, __m128 b, int rounding);

Intel Description

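Multiply the lower single-precision (32-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.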

Intel Implementation Psudeo-Code

dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_sub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_mask_sub_round_sd(__m128d src, __mmask8 k,
                              __m128d a, __m128d b,
                              int rounding)

Intel Description

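Subtract the lower double-precision (64-bit) floating-point element in “b” from the lower double-precision (64-bit) floating-point element in “a”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.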

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] - b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_sub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d src, __mmask8 k, __m128d a, __m128d b
Param ETypes:: FP64 src, MASK k, FP64 a, FP64 b

__m128d _mm_mask_sub_sd(__m128d src, __mmask8 k, __m128d a,
                        __m128d b)

Intel Description

Subtract the lower double-precision (64-bit) floating-point element in “b” from the lower double-precision (64-bit) floating-point element in “a”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] - b[63:0]
ELSE
        dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_sub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b, int rounding
Param ETypes:: MASK k, FP64 a, FP64 b, IMM rounding

__m128d _mm_maskz_sub_round_sd(__mmask8 k, __m128d a,
                               __m128d b, int rounding)

Intel Description

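Subtract the lower double-precision (64-bit) floating-point element in “b” from the lower double-precision (64-bit) floating-point element in “a”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.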

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] - b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_maskz_sub_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __mmask8 k, __m128d a, __m128d b
Param ETypes:: MASK k, FP64 a, FP64 b

__m128d _mm_maskz_sub_sd(__mmask8 k, __m128d a, __m128d b);

Intel Description

Subtract the lower double-precision (64-bit) floating-point element in “b” from the lower double-precision (64-bit) floating-point element in “a”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper element from “a” to the upper element of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[63:0] := a[63:0] - b[63:0]
ELSE
        dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_sub_round_sd#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128d
Param Types:: __m128d a, __m128d b, int rounding
Param ETypes:: FP64 a, FP64 b, IMM rounding

__m128d _mm_sub_round_sd(__m128d a, __m128d b,
                         int rounding)

Intel Description

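Subtract the lower double-precision (64-bit) floating-point element in “b” from the lower double-precision (64-bit) floating-point element in “a”, store the result in the lower element of “dst”, and copy the upper element from “a” to the upper element of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.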

Intel Implementation Psudeo-Code

dst[63:0] := a[63:0] - b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0

_mm_mask_sub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_mask_sub_round_ss(__m128 src, __mmask8 k,
                             __m128 a, __m128 b,
                             int rounding)

Intel Description

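Subtract the lower single-precision (32-bit) floating-point element in “b” from the lower single-precision (32-bit) floating-point element in “a”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.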

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] - b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_sub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128 a, __m128 b
Param ETypes:: FP32 src, MASK k, FP32 a, FP32 b

__m128 _mm_mask_sub_ss(__m128 src, __mmask8 k, __m128 a,
                       __m128 b)

Intel Description

Subtract the lower single-precision (32-bit) floating-point element in “b” from the lower single-precision (32-bit) floating-point element in “a”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] - b[31:0]
ELSE
        dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_sub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b, int rounding
Param ETypes:: MASK k, FP32 a, FP32 b, IMM rounding

__m128 _mm_maskz_sub_round_ss(__mmask8 k, __m128 a,
                              __m128 b, int rounding)

Intel Description

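Subtract the lower single-precision (32-bit) floating-point element in “b” from the lower single-precision (32-bit) floating-point element in “a”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.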

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] - b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_sub_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 a, __m128 b
Param ETypes:: MASK k, FP32 a, FP32 b

__m128 _mm_maskz_sub_ss(__mmask8 k, __m128 a, __m128 b);

Intel Description

Subtract the lower single-precision (32-bit) floating-point element in “b” from the lower single-precision (32-bit) floating-point element in “a”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Psudeo-Code

IF k[0]
        dst[31:0] := a[31:0] - b[31:0]
ELSE
        dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_sub_round_ss#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 a, __m128 b, int rounding
Param ETypes:: FP32 a, FP32 b, IMM rounding

__m128 _mm_sub_round_ss(__m128 a, __m128 b, int rounding);

Intel Description

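Subtract the lower single-precision (32-bit) floating-point element in “b” from the lower single-precision (32-bit) floating-point element in “a”, store the result in the lower element of “dst”, and copy the upper 3 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.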

Intel Implementation Psudeo-Code

dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_madd52lo_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b, __m128i c
Param ETypes:: UI64 a, UI64 b, UI64 c

__m128i _mm_madd52lo_epu64(__m128i a, __m128i b,
                           __m128i c)

Intel Description

Multiply packed unsigned 52-bit integers in each 64-bit element of “b” and “c” to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in “a”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
        dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ENDFOR
dst[MAX:128] := 0

_mm_mask_madd52lo_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __mmask8 k, __m128i b, __m128i c
Param ETypes:: UI64 a, MASK k, UI64 b, UI64 c

__m128i _mm_mask_madd52lo_epu64(__m128i a, __mmask8 k,
                                __m128i b, __m128i c)

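Intel Description

Multiply packed unsigned 52-bit integers in each 64-bit element of “b” and “c” to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in “a”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code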

FOR j := 0 to 1
        i := j*64
        IF k[j]
                tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
                dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_madd52lo_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b, __m128i c
Param ETypes:: MASK k, UI64 a, UI64 b, UI64 c

__m128i _mm_maskz_madd52lo_epu64(__mmask8 k, __m128i a,
                                 __m128i b, __m128i c)

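Intel Description

Multiply packed unsigned 52-bit integers in each 64-bit element of “b” and “c” to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code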

FOR j := 0 to 1
        i := j*64
        IF k[j]
                tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
                dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_madd52hi_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __m128i b, __m128i c
Param ETypes:: UI64 a, UI64 b, UI64 c

__m128i _mm_madd52hi_epu64(__m128i a, __m128i b,
                           __m128i c)

Intel Description

Multiply packed unsigned 52-bit integers in each 64-bit element of “b” and “c” to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in “a”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 1
        i := j*64
        tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
        dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ENDFOR
dst[MAX:128] := 0

_mm_mask_madd52hi_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i a, __mmask8 k, __m128i b, __m128i c
Param ETypes:: UI64 a, MASK k, UI64 b, UI64 c

__m128i _mm_mask_madd52hi_epu64(__m128i a, __mmask8 k,
                                __m128i b, __m128i c)

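Intel Description

Multiply packed unsigned 52-bit integers in each 64-bit element of “b” and “c” to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in “a”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code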

FOR j := 0 to 1
        i := j*64
        IF k[j]
                tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
                dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
        ELSE
                dst[i+63:i] := a[i+63:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_madd52hi_epu64#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i a, __m128i b, __m128i c
Param ETypes:: MASK k, UI64 a, UI64 b, UI64 c

__m128i _mm_maskz_madd52hi_epu64(__mmask8 k, __m128i a,
                                 __m128i b, __m128i c)

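Intel Description

Multiply packed unsigned 52-bit integers in each 64-bit element of “b” and “c” to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code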

FOR j := 0 to 1
        i := j*64
        IF k[j]
                tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
                dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_dpbf16_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __m128bh a, __m128bh b
Param ETypes:: FP32 src, BF16 a, BF16 b

__m128 _mm_dpbf16_ps(__m128 src, __m128bh a, __m128bh b);

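Intel Description

Compute dot-product of BF16 (16-bit) floating-point pairs in “a” and “b”, accumulating the intermediate single-precision (32-bit) floating-point elements with elements in “src”, and store the results in “dst”.

Intel Implementation Psudeo-Code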

DEFINE make_fp32(x[15:0]) {
        y.fp32  := 0.0
        y[31:16] := x[15:0]
        RETURN y
}
dst := src
FOR j := 0 to 3
        dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
        dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
ENDFOR
dst[MAX:128] := 0

_mm_mask_dpbf16_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __m128 src, __mmask8 k, __m128bh a, __m128bh b
Param ETypes:: FP32 src, MASK k, BF16 a, BF16 b

__m128 _mm_mask_dpbf16_ps(__m128 src, __mmask8 k,
                          __m128bh a, __m128bh b)

Intel Description

Compute dot-product of BF16 (16-bit) floating-point pairs in “a” and “b”, accumulating the intermediate single-precision (32-bit) floating-point elements with elements in “src”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

DEFINE make_fp32(x[15:0]) {
        y.fp32  := 0.0
        y[31:16] := x[15:0]
        RETURN y
}
dst := src
FOR j := 0 to 3
        IF k[j]
                dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
                dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
        ELSE
                dst.dword[j] := src.dword[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_dpbf16_ps#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128
Param Types:: __mmask8 k, __m128 src, __m128bh a, __m128bh b
Param ETypes:: MASK k, FP32 src, BF16 a, BF16 b

__m128 _mm_maskz_dpbf16_ps(__mmask8 k, __m128 src,
                           __m128bh a, __m128bh b)

Intel Description

Compute dot-product of BF16 (16-bit) floating-point pairs in “a” and “b”, accumulating the intermediate single-precision (32-bit) floating-point elements with elements in “src”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

DEFINE make_fp32(x[15:0]) {
        y.fp32  := 0.0
        y[31:16] := x[15:0]
        RETURN y
}
dst := src
FOR j := 0 to 3
        IF k[j]
                dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
                dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
        ELSE
                dst.dword[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_add_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_add_ph(__m128h a, __m128h b);

Intel Description

Add packed half-precision (16-bit) floating-point elements in “a” and “b”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 TO 7
        dst.fp16[j] := a.fp16[j] + b.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_add_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_add_ph(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Add packed half-precision (16-bit) floating-point elements in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 TO 7
        IF k[j]
                dst.fp16[j] := a.fp16[j] + b.fp16[j]
        ELSE
                dst.fp16[j] := src.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_add_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_add_ph(__mmask8 k, __m128h a, __m128h b);

Intel Description

Add packed half-precision (16-bit) floating-point elements in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 TO 7
        IF k[j]
                dst.fp16[j] := a.fp16[j] + b.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_div_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_div_ph(__m128h a, __m128h b);

Intel Description

Divide packed half-precision (16-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        dst.fp16[j] := a.fp16[j] / b.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_div_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_div_ph(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Divide packed half-precision (16-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := a.fp16[j] / b.fp16[j]
        ELSE
                dst.fp16[j] := src.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_div_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_div_ph(__mmask8 k, __m128h a, __m128h b);

Intel Description

Divide packed half-precision (16-bit) floating-point elements in “a” by packed elements in “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := a.fp16[j] / b.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmadd_ph(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmadd_ph(__m128h a, __mmask8 k, __m128h b,
                          __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := a.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmadd_ph(__m128h a, __m128h b, __m128h c,
                           __mmask8 k)

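Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code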

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmadd_ph(__mmask8 k, __m128h a, __m128h b,
                           __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the intermediate result to packed elements in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fnmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fnmadd_ph(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_fnmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fnmadd_ph(__m128h a, __mmask8 k, __m128h b,
                           __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := a.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fnmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fnmadd_ph(__m128h a, __m128h b, __m128h c,
                            __mmask8 k)

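Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code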

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fnmadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fnmadd_ph(__mmask8 k, __m128h a,
                            __m128h b, __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, add the negated intermediate result to packed elements in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmsub_ph(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmsub_ph(__m128h a, __mmask8 k, __m128h b,
                          __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := a.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmsub_ph(__m128h a, __m128h b, __m128h c,
                           __mmask8 k)

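Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code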

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmsub_ph(__mmask8 k, __m128h a, __m128h b,
                           __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fnmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fnmsub_ph(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_fnmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fnmsub_ph(__m128h a, __mmask8 k, __m128h b,
                           __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := a.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fnmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fnmsub_ph(__m128h a, __m128h b, __m128h c,
                            __mmask8 k)

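Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code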

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fnmsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fnmsub_ph(__mmask8 k, __m128h a,
                            __m128h b, __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, subtract packed elements in “c” from the negated intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fmaddsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmaddsub_ph(__m128h a, __m128h b, __m128h c);

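Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst”.

Intel Implementation Psudeo-Code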

FOR j := 0 to 7
        IF ((j & 1) == 0)
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
        ELSE
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmaddsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmaddsub_ph(__m128h a, __mmask8 k,
                             __m128h b, __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                IF ((j & 1) == 0)
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
                ELSE
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
                FI
        ELSE
                dst.fp16[j] := a.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmaddsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmaddsub_ph(__m128h a, __m128h b,
                              __m128h c, __mmask8 k)

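Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code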

FOR j := 0 to 7
        IF k[j]
                IF ((j & 1) == 0)
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
                ELSE
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
                FI
        ELSE
                dst.fp16[j] := c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmaddsub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmaddsub_ph(__mmask8 k, __m128h a,
                              __m128h b, __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively add and subtract packed elements in “c” to/from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                IF ((j & 1) == 0)
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
                ELSE
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
                FI
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fmsubadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmsubadd_ph(__m128h a, __m128h b, __m128h c);

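Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” to/from the intermediate result, and store the results in “dst”.

Intel Implementation Psudeo-Code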

FOR j := 0 to 7
        IF ((j & 1) == 0)
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
        ELSE
                dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmsubadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmsubadd_ph(__m128h a, __mmask8 k,
                             __m128h b, __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                IF ((j & 1) == 0)
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
                ELSE
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
                FI
        ELSE
                dst.fp16[j] := a.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmsubadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmsubadd_ph(__m128h a, __m128h b,
                              __m128h c, __mmask8 k)

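Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” to/from the intermediate result, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code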

FOR j := 0 to 7
        IF k[j]
                IF ((j & 1) == 0)
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
                ELSE
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
                FI
        ELSE
                dst.fp16[j] := c.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmsubadd_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmsubadd_ph(__mmask8 k, __m128h a,
                              __m128h b, __m128h c)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, alternatively subtract and add packed elements in “c” to/from the intermediate result, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 to 7
        IF k[j]
                IF ((j & 1) == 0)
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j]
                ELSE
                        dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j]
                FI
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_sub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_sub_ph(__m128h a, __m128h b);

Intel Description

Subtract packed half-precision (16-bit) floating-point elements in “b” from packed half-precision (16-bit) floating-point elements in “a”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR j := 0 TO 7
        dst.fp16[j] := a.fp16[j] - b.fp16[j]
ENDFOR
dst[MAX:128] := 0

_mm_mask_sub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_sub_ph(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Subtract packed half-precision (16-bit) floating-point elements in “b” from packed half-precision (16-bit) floating-point elements in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 TO 7
        IF k[j]
                dst.fp16[j] := a.fp16[j] - b.fp16[j]
        ELSE
                dst.fp16[j] := src.fp16[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_sub_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_sub_ph(__mmask8 k, __m128h a, __m128h b);

Intel Description

Subtract packed half-precision (16-bit) floating-point elements in “b” from packed half-precision (16-bit) floating-point elements in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR j := 0 TO 7
        IF k[j]
                dst.fp16[j] := a.fp16[j] - b.fp16[j]
        ELSE
                dst.fp16[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mul_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_mul_ph(__m128h a, __m128h b);

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, and store the results in “dst”.

Intel Implementation Psudeo-Code

FOR i := 0 TO 7
        dst.fp16[i] := a.fp16[i] * b.fp16[i]
ENDFOR
dst[MAX:128] := 0

_mm_mask_mul_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_mul_ph(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR i := 0 TO 7
        IF k[i]
                dst.fp16[i] := a.fp16[i] * b.fp16[i]
        ELSE
                dst.fp16[i] := src.fp16[i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mul_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_mul_ph(__mmask8 k, __m128h a, __m128h b);

Intel Description

Multiply packed half-precision (16-bit) floating-point elements in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Psudeo-Code

FOR i := 0 TO 7
        IF k[i]
                dst.fp16[i] := a.fp16[i] * b.fp16[i]
        ELSE
                dst.fp16[i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_fmul_pch(__m128h a, __m128h b);

Intel Description

Multiply packed complex numbers in “a” and “b”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1])
        dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1])
ENDFOR
dst[MAX:128] := 0

_mm_mul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_mul_pch(__m128h a, __m128h b);

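Intel Description

Multiply packed complex numbers in “a” and “b”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code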

FOR i := 0 to 3
        dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1])
        dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1])
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_fmul_pch(__m128h src, __mmask8 k,
                          __m128h a, __m128h b)

Intel Description

Multiply packed complex numbers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := src.fp16[2*i+0]
                dst.fp16[2*i+1] := src.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_mul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_mul_pch(__m128h src, __mmask8 k, __m128h a,
                         __m128h b)

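Intel Description

Multiply packed complex numbers in “a” and “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code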

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := src.fp16[2*i+0]
                dst.fp16[2*i+1] := src.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_fmul_pch(__mmask8 k, __m128h a,
                           __m128h b)

Intel Description

Multiply packed complex numbers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := 0
                dst.fp16[2*i+1] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_mul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_mul_pch(__mmask8 k, __m128h a, __m128h b);

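Intel Description

Multiply packed complex numbers in “a” and “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code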

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := 0
                dst.fp16[2*i+1] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fcmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_fcmul_pch(__m128h a, __m128h b);

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1])
        dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1])
ENDFOR
dst[MAX:128] := 0

_mm_cmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_cmul_pch(__m128h a, __m128h b);

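Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code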

FOR i := 0 to 3
        dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1])
        dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1])
ENDFOR
dst[MAX:128] := 0

_mm_mask_fcmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_fcmul_pch(__m128h src, __mmask8 k,
                           __m128h a, __m128h b)

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := src.fp16[2*i+0]
                dst.fp16[2*i+1] := src.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_cmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_cmul_pch(__m128h src, __mmask8 k,
                          __m128h a, __m128h b)

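Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code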

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := src.fp16[2*i+0]
                dst.fp16[2*i+1] := src.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fcmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_fcmul_pch(__mmask8 k, __m128h a,
                            __m128h b)

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := 0
                dst.fp16[2*i+1] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_cmul_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_cmul_pch(__mmask8 k, __m128h a,
                           __m128h b)

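Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code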

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1])
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1])
        ELSE
                dst.fp16[2*i+0] := 0
                dst.fp16[2*i+1] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmadd_pch(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply packed complex numbers in “a” and “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

FOR i := 0 to 3
        dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
        dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
ENDFOR
dst[MAX:128] := 0

_mm_mask_fmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmadd_pch(__m128h a, __mmask8 k, __m128h b,
                           __m128h c)

Intel Description

Multiply packed complex numbers in “a” and “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
        ELSE
                dst.fp16[2*i+0] := a.fp16[2*i+0]
                dst.fp16[2*i+1] := a.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmadd_pch(__m128h a, __m128h b, __m128h c,
                            __mmask8 k)

Intel Description

Multiply packed complex numbers in “a” and “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
        ELSE
                dst.fp16[2*i+0] := c.fp16[2*i+0]
                dst.fp16[2*i+1] := c.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmadd_pch(__mmask8 k, __m128h a,
                            __m128h b, __m128h c)

Intel Description

Multiply packed complex numbers in “a” and “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
        ELSE
                dst.fp16[2*i+0] := 0
                dst.fp16[2*i+1] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_fcmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fcmadd_pch(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
        dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
ENDFOR
dst[MAX:128] := 0

_mm_mask_fcmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fcmadd_pch(__m128h a, __mmask8 k,
                            __m128h b, __m128h c)

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst” using writemask “k” (elements are copied from “a” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
        ELSE
                dst.fp16[2*i+0] := a.fp16[2*i+0]
                dst.fp16[2*i+1] := a.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask3_fcmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fcmadd_pch(__m128h a, __m128h b,
                             __m128h c, __mmask8 k)

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst” using writemask “k” (elements are copied from “c” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
        ELSE
                dst.fp16[2*i+0] := c.fp16[2*i+0]
                dst.fp16[2*i+1] := c.fp16[2*i+1]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_fcmadd_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fcmadd_pch(__mmask8 k, __m128h a,
                             __m128h b, __m128h c)

Intel Description

Multiply packed complex numbers in “a” by the complex conjugates of packed complex numbers in “b”, accumulate to the corresponding complex numbers in “c”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR i := 0 to 3
        IF k[i]
                dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0]
                dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1]
        ELSE
                dst.fp16[2*i+0] := 0
                dst.fp16[2*i+1] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_reduce_add_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: _Float16
Param Types:: __m128h a
Param ETypes:: FP16 a

_Float16 _mm_reduce_add_ph(__m128h a);

Intel Description

Reduce the packed half-precision (16-bit) floating-point elements in “a” by addition. Returns the sum of all elements in “a”.

Intel Implementation Pseudo-Code

tmp := a
FOR i := 0 to 3
        tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4]
ENDFOR
FOR i := 0 to 1
        tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2]
ENDFOR
dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1]

_mm_reduce_mul_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: _Float16
Param Types:: __m128h a
Param ETypes:: FP16 a

_Float16 _mm_reduce_mul_ph(__m128h a);

Intel Description

Reduce the packed half-precision (16-bit) floating-point elements in “a” by multiplication. Returns the product of all elements in “a”.

Intel Implementation Pseudo-Code

tmp := a
FOR i := 0 to 3
        tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4]
ENDFOR
FOR i := 0 to 1
        tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2]
ENDFOR
dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1]

_mm_reduce_max_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: _Float16
Param Types:: __m128h a
Param ETypes:: FP16 a

_Float16 _mm_reduce_max_ph(__m128h a);

Intel Description

Reduce the packed half-precision (16-bit) floating-point elements in “a” by maximum. Returns the maximum of all elements in “a”.

Intel Implementation Pseudo-Code

tmp := a
FOR i := 0 to 3
        tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4])
ENDFOR
FOR i := 0 to 1
        tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2])
ENDFOR
dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1])

_mm_reduce_min_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: _Float16
Param Types:: __m128h a
Param ETypes:: FP16 a

_Float16 _mm_reduce_min_ph(__m128h a);

Intel Description

Reduce the packed half-precision (16-bit) floating-point elements in “a” by minimum. Returns the minimum of all elements in “a”.

Intel Implementation Pseudo-Code

tmp := a
FOR i := 0 to 3
        tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4])
ENDFOR
FOR i := 0 to 1
        tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2])
ENDFOR
dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1])

_mm_abs_ph#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h v2
Param ETypes:: FP16 v2

__m128h _mm_abs_ph(__m128h v2);

Intel Description

Finds the absolute value of each packed half-precision (16-bit) floating-point element in “v2”, storing the results in “dst”.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        dst.fp16[j] := ABS(v2.fp16[j])
ENDFOR
dst[MAX:128] := 0

_mm_conj_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a
Param ETypes:: FP16 a

__m128h _mm_conj_pch(__m128h a);

Intel Description

Compute the complex conjugates of complex numbers in “a”, and store the results in “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        dst[i+31:i] := a[i+31:i] XOR FP32(-0.0)
ENDFOR
dst[MAX:128] := 0

_mm_mask_conj_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a
Param ETypes:: FP16 src, MASK k, FP16 a

__m128h _mm_mask_conj_pch(__m128h src, __mmask8 k,
                          __m128h a)

Intel Description

Compute the complex conjugates of complex numbers in “a”, and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] XOR FP32(-0.0)
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_maskz_conj_pch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a
Param ETypes:: MASK k, FP16 a

__m128h _mm_maskz_conj_pch(__mmask8 k, __m128h a);

Intel Description

Compute the complex conjugates of complex numbers in “a”, and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        IF k[j]
                dst[i+31:i] := a[i+31:i] XOR FP32(-0.0)
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_add_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_add_sh(__m128h a, __m128h b);

Intel Description

Add the lower half-precision (16-bit) floating-point elements in “a” and “b”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] + b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_add_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_add_round_sh(__m128h a, __m128h b,
                         int rounding)

Intel Description

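Add the lower half-precision (16-bit) floating-point elements in “a” and “b”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.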

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] + b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_add_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_add_sh(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Add the lower half-precision (16-bit) floating-point elements in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] + b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_add_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_add_round_sh(__m128h src, __mmask8 k,
                              __m128h a, __m128h b,
                              int rounding)

Intel Description

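Add the lower half-precision (16-bit) floating-point elements in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.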

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] + b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_add_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_add_sh(__mmask8 k, __m128h a, __m128h b);

Intel Description

Add the lower half-precision (16-bit) floating-point elements in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] + b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_add_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_add_round_sh(__mmask8 k, __m128h a,
                               __m128h b, int rounding)

Intel Description

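Add the lower half-precision (16-bit) floating-point elements in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.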

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] + b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_div_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_div_sh(__m128h a, __m128h b);

Intel Description

Divide the lower half-precision (16-bit) floating-point element in “a” by the lower half-precision (16-bit) floating-point element in “b”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] / b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_div_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_div_sh(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Divide the lower half-precision (16-bit) floating-point element in “a” by the lower half-precision (16-bit) floating-point element in “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] / b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_div_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_div_sh(__mmask8 k, __m128h a, __m128h b);

Intel Description

Divide the lower half-precision (16-bit) floating-point element in “a” by the lower half-precision (16-bit) floating-point element in “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] / b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_div_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_div_round_sh(__m128h a, __m128h b,
                         int rounding)

Intel Description

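Divide the lower half-precision (16-bit) floating-point element in “a” by the lower half-precision (16-bit) floating-point element in “b”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.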

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] / b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_div_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_div_round_sh(__m128h src, __mmask8 k,
                              __m128h a, __m128h b,
                              int rounding)

Intel Description

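Divide the lower half-precision (16-bit) floating-point element in “a” by the lower half-precision (16-bit) floating-point element in “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.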

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] / b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_div_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_div_round_sh(__mmask8 k, __m128h a,
                               __m128h b, int rounding)

Intel Description

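Divide the lower half-precision (16-bit) floating-point element in “a” by the lower half-precision (16-bit) floating-point element in “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.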

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] / b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmadd_sh(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmadd_sh(__m128h a, __mmask8 k, __m128h b,
                          __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmadd_sh(__m128h a, __m128h b, __m128h c,
                           __mmask8 k)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmadd_sh(__mmask8 k, __m128h a, __m128h b,
                           __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_fmadd_round_sh(__m128h a, __m128h b, __m128h c,
                           const int rounding)

Intel Description

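Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.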

Intel Implementation Pseudo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c, IMM rounding

__m128h _mm_mask_fmadd_round_sh(__m128h a, __mmask8 k,
                                __m128h b, __m128h c,
                                const int rounding)

Intel Description

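Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.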

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k, IMM rounding

__m128h _mm_mask3_fmadd_round_sh(__m128h a, __m128h b,
                                 __m128h c, __mmask8 k,
                                 const int rounding)

Intel Description

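Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.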

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_maskz_fmadd_round_sh(__mmask8 k, __m128h a,
                                 __m128h b, __m128h c,
                                 const int rounding)

Intel Description

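Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each suppressing exceptions), or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC.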

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fnmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fnmadd_sh(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fnmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fnmadd_sh(__m128h a, __mmask8 k, __m128h b,
                           __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fnmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fnmadd_sh(__m128h a, __m128h b, __m128h c,
                            __mmask8 k)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fnmadd_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fnmadd_sh(__mmask8 k, __m128h a,
                            __m128h b, __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fnmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_fnmadd_round_sh(__m128h a, __m128h b, __m128h c,
                            const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fnmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c, IMM rounding

__m128h _mm_mask_fnmadd_round_sh(__m128h a, __mmask8 k,
                                 __m128h b, __m128h c,
                                 const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fnmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k, IMM rounding

__m128h _mm_mask3_fnmadd_round_sh(__m128h a, __m128h b,
                                  __m128h c, __mmask8 k,
                                  const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fnmadd_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_maskz_fnmadd_round_sh(__mmask8 k, __m128h a,
                                  __m128h b, __m128h c,
                                  const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and add the negated intermediate result to the lower element in “c”. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmsub_sh(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmsub_sh(__m128h a, __mmask8 k, __m128h b,
                          __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmsub_sh(__m128h a, __m128h b, __m128h c,
                           __mmask8 k)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmsub_sh(__mmask8 k, __m128h a, __m128h b,
                           __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_fmsub_round_sh(__m128h a, __m128h b, __m128h c,
                           const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c, IMM rounding

__m128h _mm_mask_fmsub_round_sh(__m128h a, __mmask8 k,
                                __m128h b, __m128h c,
                                const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k, IMM rounding

__m128h _mm_mask3_fmsub_round_sh(__m128h a, __m128h b,
                                 __m128h c, __mmask8 k,
                                 const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_maskz_fmsub_round_sh(__mmask8 k, __m128h a,
                                 __m128h b, __m128h c,
                                 const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fnmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fnmsub_sh(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fnmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fnmsub_sh(__m128h a, __mmask8 k, __m128h b,
                           __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fnmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fnmsub_sh(__m128h a, __m128h b, __m128h c,
                            __mmask8 k)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fnmsub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fnmsub_sh(__mmask8 k, __m128h a,
                            __m128h b, __m128h c)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fnmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_fnmsub_round_sh(__m128h a, __m128h b, __m128h c,
                            const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_fnmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c, IMM rounding

__m128h _mm_mask_fnmsub_round_sh(__m128h a, __mmask8 k,
                                 __m128h b, __m128h c,
                                 const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “a” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := a.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask3_fnmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k, IMM rounding

__m128h _mm_mask3_fnmsub_round_sh(__m128h a, __m128h b,
                                  __m128h c, __mmask8 k,
                                  const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using writemask “k” (the element is copied from “c” when mask bit 0 is not set), and copy the upper 7 packed elements from “c” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := c.fp16[0]
FI
dst[127:16] := c[127:16]
dst[MAX:128] := 0

_mm_maskz_fnmsub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_maskz_fnmsub_round_sh(__mmask8 k, __m128h a,
                                  __m128h b, __m128h c,
                                  const int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point elements in “a” and “b”, and subtract the lower element in “c” from the negated intermediate result. Store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_sub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_sub_sh(__m128h a, __m128h b);

Intel Description

Subtract the lower half-precision (16-bit) floating-point element in “b” from the lower half-precision (16-bit) floating-point element in “a”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] - b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_sub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_sub_round_sh(__m128h a, __m128h b,
                         int rounding)

Intel Description

Subtract the lower half-precision (16-bit) floating-point element in “b” from the lower half-precision (16-bit) floating-point element in “a”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] - b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_sub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_sub_sh(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Subtract the lower half-precision (16-bit) floating-point element in “b” from the lower half-precision (16-bit) floating-point element in “a”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] - b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_sub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_sub_round_sh(__m128h src, __mmask8 k,
                              __m128h a, __m128h b,
                              int rounding)

Intel Description

Subtract the lower half-precision (16-bit) floating-point element in “b” from the lower half-precision (16-bit) floating-point element in “a”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] - b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_sub_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_sub_sh(__mmask8 k, __m128h a, __m128h b);

Intel Description

Subtract the lower half-precision (16-bit) floating-point element in “b” from the lower half-precision (16-bit) floating-point element in “a”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] - b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_sub_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_sub_round_sh(__mmask8 k, __m128h a,
                               __m128h b, int rounding)

Intel Description

Subtract the lower half-precision (16-bit) floating-point element in “b” from the lower half-precision (16-bit) floating-point element in “a”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] - b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mul_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_mul_sh(__m128h a, __m128h b);

Intel Description

Multiply the lower half-precision (16-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] * b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mul_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_mul_round_sh(__m128h a, __m128h b,
                         int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst”, and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

dst.fp16[0] := a.fp16[0] * b.fp16[0]
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_mul_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_mul_sh(__m128h src, __mmask8 k, __m128h a,
                        __m128h b)

Intel Description

Multiply the lower half-precision (16-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] * b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_mask_mul_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_mul_round_sh(__m128h src, __mmask8 k,
                              __m128h a, __m128h b,
                              int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using writemask “k” (the element is copied from “src” when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] * b.fp16[0]
ELSE
        dst.fp16[0] := src.fp16[0]
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_mul_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_mul_sh(__mmask8 k, __m128h a, __m128h b);

Intel Description

Multiply the lower half-precision (16-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] * b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_maskz_mul_round_sh#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_mul_round_sh(__mmask8 k, __m128h a,
                               __m128h b, int rounding)

Intel Description

Multiply the lower half-precision (16-bit) floating-point element in “a” and “b”, store the result in the lower element of “dst” using zeromask “k” (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from “a” to the upper elements of “dst”.: [round_note]

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := a.fp16[0] * b.fp16[0]
ELSE
        dst.fp16[0] := 0
FI
dst[127:16] := a[127:16]
dst[MAX:128] := 0

_mm_fmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_fmul_sch(__m128h a, __m128h b);

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_mul_sch(__m128h a, __m128h b);

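Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code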

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_fmul_sch(__m128h src, __mmask8 k,
                          __m128h a, __m128h b)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_mul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_mul_sch(__m128h src, __mmask8 k, __m128h a,
                         __m128h b)

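Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code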

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_fmul_sch(__mmask8 k, __m128h a,
                           __m128h b)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_mul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_mul_sch(__mmask8 k, __m128h a, __m128h b);

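Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code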

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_fmul_round_sch(__m128h a, __m128h b,
                           const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_mul_round_sch(__m128h a, __m128h b,
                          const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_fmul_round_sch(__m128h src, __mmask8 k,
                                __m128h a, __m128h b,
                                const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_mul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_mul_round_sch(__m128h src, __mmask8 k,
                               __m128h a, __m128h b,
                               const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_fmul_round_sch(__mmask8 k, __m128h a,
                                 __m128h b,
                                 const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_mul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_mul_round_sch(__mmask8 k, __m128h a,
                                __m128h b,
                                const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fcmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_fcmul_sch(__m128h a, __m128h b);

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_cmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b
Param ETypes:: FP16 a, FP16 b

__m128h _mm_cmul_sch(__m128h a, __m128h b);

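Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code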

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fcmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_fcmul_sch(__m128h src, __mmask8 k,
                           __m128h a, __m128h b)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_cmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b

__m128h _mm_mask_cmul_sch(__m128h src, __mmask8 k,
                          __m128h a, __m128h b)

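Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code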

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fcmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_fcmul_sch(__mmask8 k, __m128h a,
                            __m128h b)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_cmul_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b
Param ETypes:: MASK k, FP16 a, FP16 b

__m128h _mm_maskz_cmul_sch(__mmask8 k, __m128h a,
                           __m128h b)

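Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Psudeo-Code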

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fcmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_fcmul_round_sch(__m128h a, __m128h b,
                            const int rounding)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_cmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 a, FP16 b, IMM rounding

__m128h _mm_cmul_round_sch(__m128h a, __m128h b,
                           const int rounding)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fcmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_fcmul_round_sch(__m128h src, __mmask8 k,
                                 __m128h a, __m128h b,
                                 const int rounding)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_cmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h src, __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: FP16 src, MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_mask_cmul_round_sch(__m128h src, __mmask8 k,
                                __m128h a, __m128h b,
                                const int rounding)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “src” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := src.fp16[0]
        dst.fp16[1] := src.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_fcmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_fcmul_round_sch(__mmask8 k, __m128h a,
                                  __m128h b,
                                  const int rounding)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_cmul_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, IMM rounding

__m128h _mm_maskz_cmul_round_sch(__mmask8 k, __m128h a,
                                 __m128h b,
                                 const int rounding)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1])
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1])
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fmadd_sch(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fmadd_sch(__m128h a, __mmask8 k, __m128h b,
                           __m128h c)

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “a” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := a.fp16[0]
        dst.fp16[1] := a.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fmadd_sch(__m128h a, __m128h b, __m128h c,
                            __mmask8 k)

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “c” when mask bit 0 is not set), and copy the upper 6 packed elements from “c” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := c.fp16[0]
        dst.fp16[1] := c.fp16[1]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_maskz_fmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fmadd_sch(__mmask8 k, __m128h a,
                            __m128h b, __m128h c)

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_fmadd_round_sch(__m128h a, __m128h b, __m128h c,
                            const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c, IMM rounding

__m128h _mm_mask_fmadd_round_sch(__m128h a, __mmask8 k,
                                 __m128h b, __m128h c,
                                 const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “a” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := a.fp16[0]
        dst.fp16[1] := a.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k, IMM rounding

__m128h _mm_mask3_fmadd_round_sch(__m128h a, __m128h b,
                                  __m128h c, __mmask8 k,
                                  const int rounding)

Intel Description

Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “c” when mask bit 0 is not set), and copy the upper 6 packed elements from “c” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”.: [round_note]

Intel Implementation Psudeo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := c.fp16[0]
        dst.fp16[1] := c.fp16[1]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_maskz_fmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_maskz_fmadd_round_sch(__mmask8 k, __m128h a,
                                  __m128h b, __m128h c,
                                  const int rounding)

Intel Description

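Multiply the lower complex numbers in “a” and “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC.

Intel Implementation Pseudo-Code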

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fcmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c
Param ETypes:: FP16 a, FP16 b, FP16 c

__m128h _mm_fcmadd_sch(__m128h a, __m128h b, __m128h c);

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fcmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c

__m128h _mm_mask_fcmadd_sch(__m128h a, __mmask8 k,
                            __m128h b, __m128h c)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “a” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := a.fp16[0]
        dst.fp16[1] := a.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fcmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k

__m128h _mm_mask3_fcmadd_sch(__m128h a, __m128h b,
                             __m128h c, __mmask8 k)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “c” when mask bit 0 is not set), and copy the upper 6 packed elements from “c” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := c.fp16[0]
        dst.fp16[1] := c.fp16[1]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_maskz_fcmadd_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c

__m128h _mm_maskz_fcmadd_sch(__mmask8 k, __m128h a,
                             __m128h b, __m128h c)

Intel Description

Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”.

Intel Implementation Pseudo-Code

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_fcmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_fcmadd_round_sch(__m128h a, __m128h b,
                             __m128h c, const int rounding)

Intel Description

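Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst”, and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC.

Intel Implementation Pseudo-Code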

dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask_fcmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __mmask8 k, __m128h b, __m128h c, const int rounding
Param ETypes:: FP16 a, MASK k, FP16 b, FP16 c, IMM rounding

__m128h _mm_mask_fcmadd_round_sch(__m128h a, __mmask8 k,
                                  __m128h b, __m128h c,
                                  const int rounding)

Intel Description

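Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “a” when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC.

Intel Implementation Pseudo-Code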

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := a.fp16[0]
        dst.fp16[1] := a.fp16[1]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_mask3_fcmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __m128h a, __m128h b, __m128h c, __mmask8 k, const int rounding
Param ETypes:: FP16 a, FP16 b, FP16 c, MASK k, IMM rounding

__m128h _mm_mask3_fcmadd_round_sch(__m128h a, __m128h b,
                                   __m128h c, __mmask8 k,
                                   const int rounding)

Intel Description

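Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using writemask “k” (elements are copied from “c” when mask bit 0 is not set), and copy the upper 6 packed elements from “c” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC.

Intel Implementation Pseudo-Code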

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := c.fp16[0]
        dst.fp16[1] := c.fp16[1]
FI
dst[127:32] := c[127:32]
dst[MAX:128] := 0

_mm_maskz_fcmadd_round_sch#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128h
Param Types:: __mmask8 k, __m128h a, __m128h b, __m128h c, const int rounding
Param ETypes:: MASK k, FP16 a, FP16 b, FP16 c, IMM rounding

__m128h _mm_maskz_fcmadd_round_sch(__mmask8 k, __m128h a,
                                   __m128h b, __m128h c,
                                   const int rounding)

Intel Description

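Multiply the lower complex number in “a” by the complex conjugate of the lower complex number in “b”, accumulate to the lower complex number in “c”, and store the result in the lower elements of “dst” using zeromask “k” (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from “a” to the upper elements of “dst”. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number “complex = vec.fp16[0] + i * vec.fp16[1]”, or the complex conjugate “conjugate = vec.fp16[0] - i * vec.fp16[1]”. Rounding is done according to the “rounding” parameter, which can be one of: (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to round to nearest, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) to round down, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) to round up, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) to truncate (each of these suppresses exceptions), or _MM_FROUND_CUR_DIRECTION to use the rounding mode in MXCSR.RC.

Intel Implementation Pseudo-Code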

IF k[0]
        dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0]
        dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1]
ELSE
        dst.fp16[0] := 0
        dst.fp16[1] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

_mm_maskz_dpwssds_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i src, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 src, SI16 a, SI16 b

__m128i _mm_maskz_dpwssds_epi32(__mmask8 k, __m128i src,
                                __m128i a, __m128i b)

Intel Description

Multiply groups of 2 adjacent pairs of signed 16-bit integers in “a” with corresponding 16-bit integers in “b”, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in “src” using signed saturation, and store the packed 32-bit results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
                tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
                dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
        ELSE
                dst.dword[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_dpwssds_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI32 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_dpwssds_epi32(__m128i src, __mmask8 k,
                               __m128i a, __m128i b)

Intel Description

Multiply groups of 2 adjacent pairs of signed 16-bit integers in “a” with corresponding 16-bit integers in “b”, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in “src” using signed saturation, and store the packed 32-bit results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
                tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
                dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
        ELSE
                dst.dword[j] := src.dword[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_dpwssds_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __m128i a, __m128i b
Param ETypes:: SI32 src, SI16 a, SI16 b

__m128i _mm_dpwssds_epi32(__m128i src, __m128i a,
                          __m128i b)

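Intel Description

Multiply groups of 2 adjacent pairs of signed 16-bit integers in “a” with corresponding 16-bit integers in “b”, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in “src” using signed saturation, and store the packed 32-bit results in “dst”.

Intel Implementation Pseudo-Code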

FOR j := 0 to 3
        tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
        tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
        dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
ENDFOR
dst[MAX:128] := 0

_mm_maskz_dpwssd_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i src, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 src, SI16 a, SI16 b

__m128i _mm_maskz_dpwssd_epi32(__mmask8 k, __m128i src,
                               __m128i a, __m128i b)

Intel Description

Multiply groups of 2 adjacent pairs of signed 16-bit integers in “a” with corresponding 16-bit integers in “b”, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in “src”, and store the packed 32-bit results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
                tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
                dst.dword[j] := src.dword[j] + tmp1 + tmp2
        ELSE
                dst.dword[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_dpwssd_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI32 src, MASK k, SI16 a, SI16 b

__m128i _mm_mask_dpwssd_epi32(__m128i src, __mmask8 k,
                              __m128i a, __m128i b)

Intel Description

Multiply groups of 2 adjacent pairs of signed 16-bit integers in “a” with corresponding 16-bit integers in “b”, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in “src”, and store the packed 32-bit results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
                tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
                dst.dword[j] := src.dword[j] + tmp1 + tmp2
        ELSE
                dst.dword[j] := src.dword[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_dpwssd_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __m128i a, __m128i b
Param ETypes:: SI32 src, SI16 a, SI16 b

__m128i _mm_dpwssd_epi32(__m128i src, __m128i a, __m128i b);

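Intel Description

Multiply groups of 2 adjacent pairs of signed 16-bit integers in “a” with corresponding 16-bit integers in “b”, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in “src”, and store the packed 32-bit results in “dst”.

Intel Implementation Pseudo-Code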

FOR j := 0 to 3
        tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
        tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
        dst.dword[j] := src.dword[j] + tmp1 + tmp2
ENDFOR
dst[MAX:128] := 0

_mm_maskz_dpbusds_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i src, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 src, UI8 a, SI8 b

__m128i _mm_maskz_dpbusds_epi32(__mmask8 k, __m128i src,
                                __m128i a, __m128i b)

Intel Description

Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in “a” with corresponding signed 8-bit integers in “b”, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in “src” using signed saturation, and store the packed 32-bit results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
                tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
                tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
                tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
                dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
        ELSE
                dst.dword[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_dpbusds_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI32 src, MASK k, UI8 a, SI8 b

__m128i _mm_mask_dpbusds_epi32(__m128i src, __mmask8 k,
                               __m128i a, __m128i b)

Intel Description

Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in “a” with corresponding signed 8-bit integers in “b”, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in “src” using signed saturation, and store the packed 32-bit results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
                tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
                tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
                tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
                dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
        ELSE
                dst.dword[j] := src.dword[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_dpbusds_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __m128i a, __m128i b
Param ETypes:: SI32 src, UI8 a, SI8 b

__m128i _mm_dpbusds_epi32(__m128i src, __m128i a,
                          __m128i b)

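Intel Description

Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in “a” with corresponding signed 8-bit integers in “b”, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in “src” using signed saturation, and store the packed 32-bit results in “dst”.

Intel Implementation Pseudo-Code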

FOR j := 0 to 3
        tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
        tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
        tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
        tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
        dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
ENDFOR
dst[MAX:128] := 0

_mm_maskz_dpbusd_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __mmask8 k, __m128i src, __m128i a, __m128i b
Param ETypes:: MASK k, SI32 src, UI8 a, SI8 b

__m128i _mm_maskz_dpbusd_epi32(__mmask8 k, __m128i src,
                               __m128i a, __m128i b)

Intel Description

Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in “a” with corresponding signed 8-bit integers in “b”, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in “src”, and store the packed 32-bit results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
                tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
                tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
                tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
                dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
        ELSE
                dst.dword[j] := 0
        FI
ENDFOR
dst[MAX:128] := 0

_mm_mask_dpbusd_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __mmask8 k, __m128i a, __m128i b
Param ETypes:: SI32 src, MASK k, UI8 a, SI8 b

__m128i _mm_mask_dpbusd_epi32(__m128i src, __mmask8 k,
                              __m128i a, __m128i b)

Intel Description

Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in “a” with corresponding signed 8-bit integers in “b”, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in “src”, and store the packed 32-bit results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        IF k[j]
                tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
                tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
                tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
                tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
                dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
        ELSE
                dst.dword[j] := src.dword[j]
        FI
ENDFOR
dst[MAX:128] := 0

_mm_dpbusd_epi32#

Tech:: AVX-512
Category:: Arithmetic
Header:: immintrin.h
Searchable:: AVX-512-Arithmetic-XMM
Register:: XMM 128 bit
Return Type:: __m128i
Param Types:: __m128i src, __m128i a, __m128i b
Param ETypes:: SI32 src, UI8 a, SI8 b

__m128i _mm_dpbusd_epi32(__m128i src, __m128i a, __m128i b);

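Intel Description

Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in “a” with corresponding signed 8-bit integers in “b”, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in “src”, and store the packed 32-bit results in “dst”.

Intel Implementation Pseudo-Code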

FOR j := 0 to 3
        tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
        tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
        tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
        tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
        dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
ENDFOR
dst[MAX:128] := 0

AVX-512-Arithmetic-XMM

Contents

AVX-512-Arithmetic-XMM#

_mm_mask_abs_epi8#

_mm_maskz_abs_epi8#

_mm_mask_abs_epi16#

_mm_maskz_abs_epi16#

_mm_mask_add_epi8#

_mm_maskz_add_epi8#

_mm_mask_adds_epi8#

_mm_maskz_adds_epi8#

_mm_mask_adds_epi16#

_mm_maskz_adds_epi16#

_mm_mask_adds_epu8#

_mm_maskz_adds_epu8#

_mm_mask_adds_epu16#

_mm_maskz_adds_epu16#

_mm_mask_add_epi16#

_mm_maskz_add_epi16#

_mm_mask_avg_epu8#

_mm_maskz_avg_epu8#

_mm_mask_avg_epu16#

_mm_maskz_avg_epu16#

_mm_mask_maddubs_epi16#

_mm_maskz_maddubs_epi16#

_mm_mask_madd_epi16#

_mm_maskz_madd_epi16#

_mm_mask_max_epi8#

_mm_maskz_max_epi8#

_mm_mask_max_epi16#

_mm_maskz_max_epi16#

_mm_mask_max_epu8#

_mm_maskz_max_epu8#

_mm_mask_max_epu16#

_mm_maskz_max_epu16#

_mm_mask_min_epi8#

_mm_maskz_min_epi8#

_mm_mask_min_epi16#

_mm_maskz_min_epi16#

_mm_mask_min_epu8#

_mm_maskz_min_epu8#

_mm_mask_min_epu16#

_mm_maskz_min_epu16#

_mm_mask_mulhrs_epi16#

_mm_maskz_mulhrs_epi16#

_mm_mask_mulhi_epu16#

_mm_maskz_mulhi_epu16#

_mm_mask_mulhi_epi16#

_mm_maskz_mulhi_epi16#

_mm_mask_mullo_epi16#

_mm_maskz_mullo_epi16#

_mm_mask_sub_epi8#

_mm_maskz_sub_epi8#

_mm_mask_subs_epi8#

_mm_maskz_subs_epi8#

_mm_mask_subs_epi16#

_mm_maskz_subs_epi16#

_mm_mask_subs_epu8#

_mm_maskz_subs_epu8#

_mm_mask_subs_epu16#

_mm_maskz_subs_epu16#

_mm_mask_sub_epi16#

_mm_maskz_sub_epi16#

_mm_reduce_add_epi16#

_mm_mask_reduce_add_epi16#

_mm_reduce_add_epi8#

_mm_mask_reduce_add_epi8#

_mm_reduce_mul_epi16#

_mm_mask_reduce_mul_epi16#

_mm_reduce_mul_epi8#

_mm_mask_reduce_mul_epi8#

_mm_reduce_or_epi16#

_mm_mask_reduce_or_epi16#

_mm_reduce_or_epi8#

_mm_mask_reduce_or_epi8#

_mm_reduce_and_epi16#

_mm_mask_reduce_and_epi16#

_mm_reduce_and_epi8#

_mm_mask_reduce_and_epi8#

_mm_mask_mullo_epi64#