AVX_ALL-Load-YMM#

_mm256_broadcast_ss#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: float const * mem_addr
Param ETypes:: FP32 mem_addr

__m256 _mm256_broadcast_ss(float const * mem_addr);

Intel Description

Broadcast a single-precision (32-bit) floating-point element from memory to all elements of “dst”.

Intel Implementation Pseudo-Code

tmp[31:0] := MEM[mem_addr+31:mem_addr]
FOR j := 0 to 7
        i := j*32
        dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0

_mm256_load_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: double const * mem_addr
Param ETypes:: FP64 mem_addr

__m256d _mm256_load_pd(double const * mem_addr);

Intel Description

Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into “dst”. “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_load_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: float const * mem_addr
Param ETypes:: FP32 mem_addr

__m256 _mm256_load_ps(float const * mem_addr);

Intel Description

Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into “dst”. “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: double const * mem_addr
Param ETypes:: FP64 mem_addr

__m256d _mm256_loadu_pd(double const * mem_addr);

Intel Description

Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into “dst”. “mem_addr” does not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: float const * mem_addr
Param ETypes:: FP32 mem_addr

__m256 _mm256_loadu_ps(float const * mem_addr);

Intel Description

Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into “dst”. “mem_addr” does not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_load_si256#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m256i const * mem_addr
Param ETypes:: M256 mem_addr

__m256i _mm256_load_si256(__m256i const * mem_addr);

Intel Description

Load 256-bits of integer data from memory into “dst”. “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu_si256#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m256i const * mem_addr
Param ETypes:: M256 mem_addr

__m256i _mm256_loadu_si256(__m256i const * mem_addr);

Intel Description

Load 256-bits of integer data from memory into “dst”. “mem_addr” does not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_maskload_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: double const * mem_addr, __m256i mask
Param ETypes:: FP64 mem_addr, MASK mask

__m256d _mm256_maskload_pd(double const* mem_addr,
                           __m256i mask)

Intel Description

Load packed double-precision (64-bit) floating-point elements from memory into “dst” using “mask” (elements are zeroed out when the high bit of the corresponding element is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        IF mask[i+63]
                dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_maskload_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: float const * mem_addr, __m256i mask
Param ETypes:: FP32 mem_addr, MASK mask

__m256 _mm256_maskload_ps(float const* mem_addr,
                          __m256i mask)

Intel Description

Load packed single-precision (32-bit) floating-point elements from memory into “dst” using “mask” (elements are zeroed out when the high bit of the corresponding element is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        IF mask[i+31]
                dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_lddqu_si256#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m256i const * mem_addr
Param ETypes:: M256 mem_addr

__m256i _mm256_lddqu_si256(__m256i const * mem_addr);

Intel Description

Load 256-bits of integer data from unaligned memory into “dst”. This intrinsic may perform better than “_mm256_loadu_si256” when the data crosses a cache line boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu2_m128#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: float const* hiaddr, float const* loaddr
Param ETypes:: FP32 hiaddr, FP32 loaddr

__m256 _mm256_loadu2_m128(float const* hiaddr,
                          float const* loaddr)

Intel Description

Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in “dst”. “hiaddr” and “loaddr” do not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0

_mm256_loadu2_m128d#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: double const* hiaddr, double const* loaddr
Param ETypes:: FP64 hiaddr, FP64 loaddr

__m256d _mm256_loadu2_m128d(double const* hiaddr,
                            double const* loaddr)

Intel Description

Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in “dst”. “hiaddr” and “loaddr” do not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0

_mm256_loadu2_m128i#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m128i const* hiaddr, __m128i const* loaddr
Param ETypes:: M128 hiaddr, M128 loaddr

__m256i _mm256_loadu2_m128i(__m128i const* hiaddr,
                            __m128i const* loaddr)

Intel Description

Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in “dst”. “hiaddr” and “loaddr” do not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0

_mm256_i32gather_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: double const* base_addr, __m128i vindex, const int scale
Param ETypes:: FP64 base_addr, SI32 vindex, IMM scale

__m256d _mm256_i32gather_pd(double const* base_addr,
                            __m128i vindex,
                            const int scale)

Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i32gather_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: float const* base_addr, __m256i vindex, const int scale
Param ETypes:: FP32 base_addr, SI32 vindex, IMM scale

__m256 _mm256_i32gather_ps(float const* base_addr,
                           __m256i vindex, const int scale)

Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i32gather_epi32#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: int const* base_addr, __m256i vindex, const int scale
Param ETypes:: UI32 base_addr, SI32 vindex, IMM scale

__m256i _mm256_i32gather_epi32(int const* base_addr,
                               __m256i vindex,
                               const int scale)

Intel Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i32gather_epi64#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __int64 const* base_addr, __m128i vindex, const int scale
Param ETypes:: UI64 base_addr, SI32 vindex, IMM scale

__m256i _mm256_i32gather_epi64(__int64 const* base_addr,
                               __m128i vindex,
                               const int scale)

Intel Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i64gather_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: double const* base_addr, __m256i vindex, const int scale
Param ETypes:: FP64 base_addr, SI64 vindex, IMM scale

__m256d _mm256_i64gather_pd(double const* base_addr,
                            __m256i vindex,
                            const int scale)

Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i64gather_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m128
Param Types:: float const* base_addr, __m256i vindex, const int scale
Param ETypes:: FP32 base_addr, SI64 vindex, IMM scale

__m128 _mm256_i64gather_ps(float const* base_addr,
                           __m256i vindex, const int scale)

Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:128] := 0

_mm256_i64gather_epi32#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m128i
Param Types:: int const* base_addr, __m256i vindex, const int scale
Param ETypes:: UI32 base_addr, SI64 vindex, IMM scale

__m128i _mm256_i64gather_epi32(int const* base_addr,
                               __m256i vindex,
                               const int scale)

Intel Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:128] := 0

_mm256_i64gather_epi64#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __int64 const* base_addr, __m256i vindex, const int scale
Param ETypes:: UI64 base_addr, SI64 vindex, IMM scale

__m256i _mm256_i64gather_epi64(__int64 const* base_addr,
                               __m256i vindex,
                               const int scale)

Intel Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_mask_i32gather_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: __m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale
Param ETypes:: FP64 src, FP64 base_addr, SI32 vindex, MASK mask, IMM scale

__m256d _mm256_mask_i32gather_pd(__m256d src,
                                 double const* base_addr,
                                 __m128i vindex,
                                 __m256d mask,
                                 const int scale)

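Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code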

FOR j := 0 to 3
        i := j*64
        m := j*32
        IF mask[i+63]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i32gather_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256
Param Types:: __m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale
Param ETypes:: FP32 src, FP32 base_addr, SI32 vindex, MASK mask, IMM scale

__m256 _mm256_mask_i32gather_ps(__m256 src,
                                float const* base_addr,
                                __m256i vindex, __m256 mask,
                                const int scale)

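Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code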

FOR j := 0 to 7
        i := j*32
        m := j*32
        IF mask[i+31]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i32gather_epi32#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale
Param ETypes:: UI32 src, UI32 base_addr, SI32 vindex, MASK mask, IMM scale

__m256i _mm256_mask_i32gather_epi32(__m256i src,
                                    int const* base_addr,
                                    __m256i vindex,
                                    __m256i mask,
                                    const int scale)

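Intel Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code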

FOR j := 0 to 7
        i := j*32
        m := j*32
        IF mask[i+31]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i32gather_epi64#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale
Param ETypes:: UI64 src, UI64 base_addr, SI32 vindex, MASK mask, IMM scale

__m256i _mm256_mask_i32gather_epi64(
    __m256i src, __int64 const* base_addr, __m128i vindex,
    __m256i mask, const int scale)

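Intel Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code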

FOR j := 0 to 3
        i := j*64
        m := j*32
        IF mask[i+63]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i64gather_pd#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256d
Param Types:: __m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale
Param ETypes:: FP64 src, FP64 base_addr, SI64 vindex, MASK mask, IMM scale

__m256d _mm256_mask_i64gather_pd(__m256d src,
                                 double const* base_addr,
                                 __m256i vindex,
                                 __m256d mask,
                                 const int scale)

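Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code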

FOR j := 0 to 3
        i := j*64
        m := j*64
        IF mask[i+63]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i64gather_ps#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m128
Param Types:: __m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale
Param ETypes:: FP32 src, FP32 base_addr, SI64 vindex, MASK mask, IMM scale

__m128 _mm256_mask_i64gather_ps(__m128 src,
                                float const* base_addr,
                                __m256i vindex, __m128 mask,
                                const int scale)

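Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code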

FOR j := 0 to 3
        i := j*32
        m := j*64
        IF mask[i+31]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

_mm256_mask_i64gather_epi32#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m128i
Param Types:: __m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale
Param ETypes:: UI32 src, UI32 base_addr, SI64 vindex, MASK mask, IMM scale

__m128i _mm256_mask_i64gather_epi32(__m128i src,
                                    int const* base_addr,
                                    __m256i vindex,
                                    __m128i mask,
                                    const int scale)

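Intel Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code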

FOR j := 0 to 3
        i := j*32
        m := j*64
        IF mask[i+31]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

_mm256_mask_i64gather_epi64#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale
Param ETypes:: UI64 src, UI64 base_addr, SI64 vindex, MASK mask, IMM scale

__m256i _mm256_mask_i64gather_epi64(
    __m256i src, __int64 const* base_addr, __m256i vindex,
    __m256i mask, const int scale)

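Intel Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code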

FOR j := 0 to 3
        i := j*64
        m := j*64
        IF mask[i+63]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_maskload_epi32#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: int const* mem_addr, __m256i mask
Param ETypes:: UI32 mem_addr, MASK mask

__m256i _mm256_maskload_epi32(int const* mem_addr,
                              __m256i mask)

Intel Description

Load packed 32-bit integers from memory into “dst” using “mask” (elements are zeroed out when the highest bit is not set in the corresponding element).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        IF mask[i+31]
                dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_maskload_epi64#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: __int64 const* mem_addr, __m256i mask
Param ETypes:: UI64 mem_addr, MASK mask

__m256i _mm256_maskload_epi64(__int64 const* mem_addr,
                              __m256i mask)

Intel Description

Load packed 64-bit integers from memory into “dst” using “mask” (elements are zeroed out when the highest bit is not set in the corresponding element).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        IF mask[i+63]
                dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_stream_load_si256#

Tech:: AVX_ALL
Category:: Load
Header:: immintrin.h
Searchable:: AVX_ALL-Load-YMM
Register:: YMM 256 bit
Return Type:: __m256i
Param Types:: void const* mem_addr
Param ETypes:: M256 mem_addr

__m256i _mm256_stream_load_si256(void const* mem_addr);

Intel Description

Load 256-bits of integer data from memory into “dst” using a non-temporal memory hint. “mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Psudeo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

AVX_ALL-Load-YMM

Contents

AVX_ALL-Load-YMM#

_mm256_broadcast_ss#

_mm256_load_pd#

_mm256_load_ps#

_mm256_loadu_pd#

_mm256_loadu_ps#

_mm256_load_si256#

_mm256_loadu_si256#

_mm256_maskload_pd#

_mm256_maskload_ps#

_mm256_lddqu_si256#

_mm256_loadu2_m128#

_mm256_loadu2_m128d#

_mm256_loadu2_m128i#

_mm256_i32gather_pd#

_mm256_i32gather_ps#

_mm256_i32gather_epi32#

_mm256_i32gather_epi64#

_mm256_i64gather_pd#

_mm256_i64gather_ps#

_mm256_i64gather_epi32#

_mm256_i64gather_epi64#

_mm256_mask_i32gather_pd#

_mm256_mask_i32gather_ps#

_mm256_mask_i32gather_epi32#

_mm256_mask_i32gather_epi64#

_mm256_mask_i64gather_pd#

_mm256_mask_i64gather_ps#

_mm256_mask_i64gather_epi32#

_mm256_mask_i64gather_epi64#

_mm256_maskload_epi32#

_mm256_maskload_epi64#

_mm256_stream_load_si256#