AVX-512-Load-YMM#
_mm256_mask_loadu_epi16#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask16 k, void const* mem_addr
- Param ETypes:
UI16 src, MASK k, UI16 mem_addr
__m256i _mm256_mask_loadu_epi16(__m256i src, __mmask16 k,
void const* mem_addr)
Intel Description
- Load packed 16-bit integers from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_loadu_epi16#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask16 k, void const* mem_addr
- Param ETypes:
MASK k, UI16 mem_addr
__m256i _mm256_maskz_loadu_epi16(__mmask16 k,
void const* mem_addr)
Intel Description
- Load packed 16-bit integers from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_loadu_epi8#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask32 k, void const* mem_addr
- Param ETypes:
UI8 src, MASK k, UI8 mem_addr
__m256i _mm256_mask_loadu_epi8(__m256i src, __mmask32 k,
void const* mem_addr)
Intel Description
- Load packed 8-bit integers from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_loadu_epi8#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask32 k, void const* mem_addr
- Param ETypes:
MASK k, UI8 mem_addr
__m256i _mm256_maskz_loadu_epi8(__mmask32 k,
void const* mem_addr)
Intel Description
- Load packed 8-bit integers from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_loadu_epi16#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
void const* mem_addr
- Param ETypes:
UI16 mem_addr
__m256i _mm256_loadu_epi16(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 16 packed 16-bit integers) from memory into “dst”.
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_loadu_epi8#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
void const* mem_addr
- Param ETypes:
UI8 mem_addr
__m256i _mm256_loadu_epi8(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 32 packed 8-bit integers) from memory into “dst”.
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_mask_expandloadu_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__m256d src, __mmask8 k, void const* mem_addr
- Param ETypes:
FP64 src, MASK k, FP64 mem_addr
__m256d _mm256_mask_expandloadu_pd(__m256d src, __mmask8 k,
void const* mem_addr)
Intel Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_expandloadu_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, FP64 mem_addr
__m256d _mm256_maskz_expandloadu_pd(__mmask8 k,
void const* mem_addr)
Intel Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_expandloadu_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__m256 src, __mmask8 k, void const* mem_addr
- Param ETypes:
FP32 src, MASK k, FP32 mem_addr
__m256 _mm256_mask_expandloadu_ps(__m256 src, __mmask8 k,
void const* mem_addr)
Intel Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_expandloadu_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, FP32 mem_addr
__m256 _mm256_maskz_expandloadu_ps(__mmask8 k,
void const* mem_addr)
Intel Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i32gather_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__m256d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale
- Param ETypes:
FP64 src, MASK k, SI32 vindex, FP64 base_addr, IMM scale
__m256d _mm256_mmask_i32gather_pd(__m256d src, __mmask8 k,
__m128i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
m := j*32
IF k[j]
addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
dst[i+63:i] := MEM[addr+63:addr]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i32gather_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__m256 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale
- Param ETypes:
FP32 src, MASK k, SI32 vindex, FP32 base_addr, IMM scale
__m256 _mm256_mmask_i32gather_ps(__m256 src, __mmask8 k,
__m256i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
m := j*32
IF k[j]
addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
dst[i+31:i] := MEM[addr+31:addr]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i64gather_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__m256d src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale
- Param ETypes:
FP64 src, MASK k, SI64 vindex, FP64 base_addr, IMM scale
__m256d _mm256_mmask_i64gather_pd(__m256d src, __mmask8 k,
__m256i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
m := j*64
IF k[j]
addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
dst[i+63:i] := MEM[addr+63:addr]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i64gather_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m128
- Param Types:
__m128 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale
- Param ETypes:
FP32 src, MASK k, SI64 vindex, FP32 base_addr, IMM scale
__m128 _mm256_mmask_i64gather_ps(__m128 src, __mmask8 k,
__m256i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*32
m := j*64
IF k[j]
addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
dst[i+31:i] := MEM[addr+31:addr]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
_mm256_mask_load_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__m256d src, __mmask8 k, void const* mem_addr
- Param ETypes:
FP64 src, MASK k, FP64 mem_addr
__m256d _mm256_mask_load_pd(__m256d src, __mmask8 k,
void const* mem_addr)
Intel Description
Load packed double-precision (64-bit) floating-point elements from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_load_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, FP64 mem_addr
__m256d _mm256_maskz_load_pd(__mmask8 k,
void const* mem_addr)
Intel Description
Load packed double-precision (64-bit) floating-point elements from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_load_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__m256 src, __mmask8 k, void const* mem_addr
- Param ETypes:
FP32 src, MASK k, FP32 mem_addr
__m256 _mm256_mask_load_ps(__m256 src, __mmask8 k,
void const* mem_addr)
Intel Description
Load packed single-precision (32-bit) floating-point elements from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_load_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, FP32 mem_addr
__m256 _mm256_maskz_load_ps(__mmask8 k,
void const* mem_addr)
Intel Description
Load packed single-precision (32-bit) floating-point elements from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set). “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_load_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, void const* mem_addr
- Param ETypes:
UI32 src, MASK k, UI32 mem_addr
__m256i _mm256_mask_load_epi32(__m256i src, __mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 32-bit integers from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_load_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, UI32 mem_addr
__m256i _mm256_maskz_load_epi32(__mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 32-bit integers from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_load_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, void const* mem_addr
- Param ETypes:
UI64 src, MASK k, UI64 mem_addr
__m256i _mm256_mask_load_epi64(__m256i src, __mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 64-bit integers from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_load_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, UI64 mem_addr
__m256i _mm256_maskz_load_epi64(__mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 64-bit integers from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_loadu_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, void const* mem_addr
- Param ETypes:
UI32 src, MASK k, UI32 mem_addr
__m256i _mm256_mask_loadu_epi32(__m256i src, __mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 32-bit integers from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_loadu_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, UI32 mem_addr
__m256i _mm256_maskz_loadu_epi32(__mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 32-bit integers from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_loadu_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, void const* mem_addr
- Param ETypes:
UI64 src, MASK k, UI64 mem_addr
__m256i _mm256_mask_loadu_epi64(__m256i src, __mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 64-bit integers from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_loadu_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, UI64 mem_addr
__m256i _mm256_maskz_loadu_epi64(__mmask8 k,
void const* mem_addr)
Intel Description
- Load packed 64-bit integers from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_loadu_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__m256d src, __mmask8 k, void const* mem_addr
- Param ETypes:
FP64 src, MASK k, FP64 mem_addr
__m256d _mm256_mask_loadu_pd(__m256d src, __mmask8 k,
void const* mem_addr)
Intel Description
- Load packed double-precision (64-bit) floating-point elements from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_loadu_pd#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256d
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, FP64 mem_addr
__m256d _mm256_maskz_loadu_pd(__mmask8 k,
void const* mem_addr)
Intel Description
- Load packed double-precision (64-bit) floating-point elements from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_loadu_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__m256 src, __mmask8 k, void const* mem_addr
- Param ETypes:
FP32 src, MASK k, FP32 mem_addr
__m256 _mm256_mask_loadu_ps(__m256 src, __mmask8 k,
void const* mem_addr)
Intel Description
- Load packed single-precision (32-bit) floating-point elements from memory into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_loadu_ps#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, FP32 mem_addr
__m256 _mm256_maskz_loadu_ps(__mmask8 k,
void const* mem_addr)
Intel Description
- Load packed single-precision (32-bit) floating-point elements from memory into “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_expandloadu_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, void const* mem_addr
- Param ETypes:
UI32 src, MASK k, UI32 mem_addr
__m256i _mm256_mask_expandloadu_epi32(__m256i src,
__mmask8 k,
void const* mem_addr)
Intel Description
Load contiguous active 32-bit integers from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_expandloadu_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, UI32 mem_addr
__m256i _mm256_maskz_expandloadu_epi32(
__mmask8 k, void const* mem_addr)
Intel Description
Load contiguous active 32-bit integers from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mask_expandloadu_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, void const* mem_addr
- Param ETypes:
UI64 src, MASK k, UI64 mem_addr
__m256i _mm256_mask_expandloadu_epi64(__m256i src,
__mmask8 k,
void const* mem_addr)
Intel Description
Load contiguous active 64-bit integers from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_maskz_expandloadu_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__mmask8 k, void const* mem_addr
- Param ETypes:
MASK k, UI64 mem_addr
__m256i _mm256_maskz_expandloadu_epi64(
__mmask8 k, void const* mem_addr)
Intel Description
Load contiguous active 64-bit integers from unaligned memory at “mem_addr” (those with their respective bit set in mask “k”), and store the results in “dst” using zeromask “k” (elements are zeroed out when the corresponding mask bit is not set).
Intel Implementation Pseudo-Code
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i32gather_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale
- Param ETypes:
UI32 src, MASK k, SI32 vindex, UI32 base_addr, IMM scale
__m256i _mm256_mmask_i32gather_epi32(__m256i src,
__mmask8 k,
__m256i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 7
i := j*32
m := j*32
IF k[j]
addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
dst[i+31:i] := MEM[addr+31:addr]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i32gather_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale
- Param ETypes:
UI64 src, MASK k, SI32 vindex, UI64 base_addr, IMM scale
__m256i _mm256_mmask_i32gather_epi64(__m256i src,
__mmask8 k,
__m128i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
m := j*32
IF k[j]
addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
dst[i+63:i] := MEM[addr+63:addr]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_mmask_i64gather_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m128i
- Param Types:
__m128i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale
- Param ETypes:
UI32 src, MASK k, SI64 vindex, UI32 base_addr, IMM scale
__m128i _mm256_mmask_i64gather_epi32(__m128i src,
__mmask8 k,
__m256i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*32
m := j*64
IF k[j]
addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
dst[i+31:i] := MEM[addr+31:addr]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
_mm256_mmask_i64gather_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale
- Param ETypes:
UI64 src, MASK k, SI64 vindex, UI64 base_addr, IMM scale
__m256i _mm256_mmask_i64gather_epi64(__m256i src,
__mmask8 k,
__m256i vindex,
void const* base_addr,
const int scale)
Intel Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using writemask “k” (elements are copied from “src” when the corresponding mask bit is not set). “scale” should be 1, 2, 4 or 8.
Intel Implementation Pseudo-Code
FOR j := 0 to 3
i := j*64
m := j*64
IF k[j]
addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
dst[i+63:i] := MEM[addr+63:addr]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
_mm256_loadu_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
void const* mem_addr
- Param ETypes:
UI64 mem_addr
__m256i _mm256_loadu_epi64(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 4 packed 64-bit integers) from memory into “dst”.
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_loadu_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
void const* mem_addr
- Param ETypes:
UI32 mem_addr
__m256i _mm256_loadu_epi32(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 8 packed 32-bit integers) from memory into “dst”.
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_load_epi64#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
void const* mem_addr
- Param ETypes:
UI64 mem_addr
__m256i _mm256_load_epi64(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 4 packed 64-bit integers) from memory into “dst”.
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_load_epi32#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256i
- Param Types:
void const* mem_addr
- Param ETypes:
UI32 mem_addr
__m256i _mm256_load_epi32(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 8 packed 32-bit integers) from memory into “dst”.
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_load_ph#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256h
- Param Types:
void const* mem_addr
- Param ETypes:
FP16 mem_addr
__m256h _mm256_load_ph(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into “dst”.
“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
_mm256_loadu_ph#
- Tech:
AVX-512
- Category:
Load
- Header:
immintrin.h
- Searchable:
AVX-512-Load-YMM
- Register:
YMM 256 bit
- Return Type:
__m256h
- Param Types:
void const* mem_addr
- Param ETypes:
FP16 mem_addr
__m256h _mm256_loadu_ph(void const* mem_addr);
Intel Description
- Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into “dst”.
“mem_addr” does not need to be aligned on any particular boundary.
Intel Implementation Pseudo-Code
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0