AVX_ALL-Load-YMM#

_mm256_broadcast_ss#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

float const * mem_addr

Param ETypes:

FP32 mem_addr

__m256 _mm256_broadcast_ss(float const * mem_addr);

Intel Description

Broadcast a single-precision (32-bit) floating-point element from memory to all elements of “dst”.

Intel Implementation Pseudo-Code

tmp[31:0] := MEM[mem_addr+31:mem_addr]
FOR j := 0 to 7
        i := j*32
        dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0

_mm256_load_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

double const * mem_addr

Param ETypes:

FP64 mem_addr

__m256d _mm256_load_pd(double const * mem_addr);

Intel Description

Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into “dst”.

“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_load_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

float const * mem_addr

Param ETypes:

FP32 mem_addr

__m256 _mm256_load_ps(float const * mem_addr);

Intel Description

Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into “dst”.

“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

double const * mem_addr

Param ETypes:

FP64 mem_addr

__m256d _mm256_loadu_pd(double const * mem_addr);

Intel Description

Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into “dst”.

“mem_addr” does not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

float const * mem_addr

Param ETypes:

FP32 mem_addr

__m256 _mm256_loadu_ps(float const * mem_addr);

Intel Description

Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into “dst”.

“mem_addr” does not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_load_si256#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m256i const * mem_addr

Param ETypes:

M256 mem_addr

__m256i _mm256_load_si256(__m256i const * mem_addr);

Intel Description

Load 256-bits of integer data from memory into “dst”.

“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu_si256#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m256i const * mem_addr

Param ETypes:

M256 mem_addr

__m256i _mm256_loadu_si256(__m256i const * mem_addr);

Intel Description

Load 256-bits of integer data from memory into “dst”.

“mem_addr” does not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_maskload_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

double const * mem_addr, __m256i mask

Param ETypes:

FP64 mem_addr, MASK mask

__m256d _mm256_maskload_pd(double const* mem_addr,
                           __m256i mask);

Intel Description

Load packed double-precision (64-bit) floating-point elements from memory into “dst” using “mask” (elements are zeroed out when the high bit of the corresponding element is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        IF mask[i+63]
                dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_maskload_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

float const * mem_addr, __m256i mask

Param ETypes:

FP32 mem_addr, MASK mask

__m256 _mm256_maskload_ps(float const* mem_addr,
                          __m256i mask);

Intel Description

Load packed single-precision (32-bit) floating-point elements from memory into “dst” using “mask” (elements are zeroed out when the high bit of the corresponding element is not set).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        IF mask[i+31]
                dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_lddqu_si256#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m256i const * mem_addr

Param ETypes:

M256 mem_addr

__m256i _mm256_lddqu_si256(__m256i const * mem_addr);

Intel Description

Load 256-bits of integer data from unaligned memory into “dst”. This intrinsic may perform better than “_mm256_loadu_si256” when the data crosses a cache line boundary.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

_mm256_loadu2_m128#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

float const* hiaddr, float const* loaddr

Param ETypes:

FP32 hiaddr, FP32 loaddr

__m256 _mm256_loadu2_m128(float const* hiaddr,
                          float const* loaddr);

Intel Description

Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in “dst”.

“hiaddr” and “loaddr” do not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0

_mm256_loadu2_m128d#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

double const* hiaddr, double const* loaddr

Param ETypes:

FP64 hiaddr, FP64 loaddr

__m256d _mm256_loadu2_m128d(double const* hiaddr,
                            double const* loaddr);

Intel Description

Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in “dst”.

“hiaddr” and “loaddr” do not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0

_mm256_loadu2_m128i#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m128i const* hiaddr, __m128i const* loaddr

Param ETypes:

M128 hiaddr, M128 loaddr

__m256i _mm256_loadu2_m128i(__m128i const* hiaddr,
                            __m128i const* loaddr);

Intel Description

Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in “dst”.

“hiaddr” and “loaddr” do not need to be aligned on any particular boundary.

Intel Implementation Pseudo-Code

dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0

_mm256_i32gather_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

double const* base_addr, __m128i vindex, const int scale

Param ETypes:

FP64 base_addr, SI32 vindex, IMM scale

__m256d _mm256_i32gather_pd(double const* base_addr,
                            __m128i vindex,
                            const int scale);

Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i32gather_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

float const* base_addr, __m256i vindex, const int scale

Param ETypes:

FP32 base_addr, SI32 vindex, IMM scale

__m256 _mm256_i32gather_ps(float const* base_addr,
                           __m256i vindex, const int scale);

Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i32gather_epi32#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

int const* base_addr, __m256i vindex, const int scale

Param ETypes:

UI32 base_addr, SI32 vindex, IMM scale

__m256i _mm256_i32gather_epi32(int const* base_addr,
                               __m256i vindex,
                               const int scale);

Intel Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i32gather_epi64#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__int64 const* base_addr, __m128i vindex, const int scale

Param ETypes:

UI64 base_addr, SI32 vindex, IMM scale

__m256i _mm256_i32gather_epi64(__int64 const* base_addr,
                               __m128i vindex,
                               const int scale);

Intel Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*32
        addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i64gather_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

double const* base_addr, __m256i vindex, const int scale

Param ETypes:

FP64 base_addr, SI64 vindex, IMM scale

__m256d _mm256_i64gather_pd(double const* base_addr,
                            __m256i vindex,
                            const int scale);

Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_i64gather_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m128

Param Types:

float const* base_addr, __m256i vindex, const int scale

Param ETypes:

FP32 base_addr, SI64 vindex, IMM scale

__m128 _mm256_i64gather_ps(float const* base_addr,
                           __m256i vindex, const int scale);

Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:128] := 0

_mm256_i64gather_epi32#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m128i

Param Types:

int const* base_addr, __m256i vindex, const int scale

Param ETypes:

UI32 base_addr, SI64 vindex, IMM scale

__m128i _mm256_i64gather_epi32(int const* base_addr,
                               __m256i vindex,
                               const int scale);

Intel Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:128] := 0

_mm256_i64gather_epi64#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__int64 const* base_addr, __m256i vindex, const int scale

Param ETypes:

UI64 base_addr, SI64 vindex, IMM scale

__m256i _mm256_i64gather_epi64(__int64 const* base_addr,
                               __m256i vindex,
                               const int scale);

Intel Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst”. “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*64
        addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
        dst[i+63:i] := MEM[addr+63:addr]
ENDFOR
dst[MAX:256] := 0

_mm256_mask_i32gather_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale

Param ETypes:

FP64 src, FP64 base_addr, SI32 vindex, MASK mask, IMM scale

__m256d _mm256_mask_i32gather_pd(__m256d src,
                                 double const* base_addr,
                                 __m128i vindex,
                                 __m256d mask,
                                 const int scale);

Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*32
        IF mask[i+63]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i32gather_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256

Param Types:

__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale

Param ETypes:

FP32 src, FP32 base_addr, SI32 vindex, MASK mask, IMM scale

__m256 _mm256_mask_i32gather_ps(__m256 src,
                                float const* base_addr,
                                __m256i vindex, __m256 mask,
                                const int scale);

Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        m := j*32
        IF mask[i+31]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i32gather_epi32#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale

Param ETypes:

UI32 src, UI32 base_addr, SI32 vindex, MASK mask, IMM scale

__m256i _mm256_mask_i32gather_epi32(__m256i src,
                                    int const* base_addr,
                                    __m256i vindex,
                                    __m256i mask,
                                    const int scale);

Intel Description

Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        m := j*32
        IF mask[i+31]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i32gather_epi64#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale

Param ETypes:

UI64 src, UI64 base_addr, SI32 vindex, MASK mask, IMM scale

__m256i _mm256_mask_i32gather_epi64(
    __m256i src, __int64 const* base_addr, __m128i vindex,
    __m256i mask, const int scale);

Intel Description

Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 32-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*32
        IF mask[i+63]
                addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i64gather_pd#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256d

Param Types:

__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale

Param ETypes:

FP64 src, FP64 base_addr, SI64 vindex, MASK mask, IMM scale

__m256d _mm256_mask_i64gather_pd(__m256d src,
                                 double const* base_addr,
                                 __m256i vindex,
                                 __m256d mask,
                                 const int scale);

Intel Description

Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*64
        IF mask[i+63]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_mask_i64gather_ps#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m128

Param Types:

__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale

Param ETypes:

FP32 src, FP32 base_addr, SI64 vindex, MASK mask, IMM scale

__m128 _mm256_mask_i64gather_ps(__m128 src,
                                float const* base_addr,
                                __m256i vindex, __m128 mask,
                                const int scale);

Intel Description

Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        m := j*64
        IF mask[i+31]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

_mm256_mask_i64gather_epi32#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m128i

Param Types:

__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale

Param ETypes:

UI32 src, UI32 base_addr, SI64 vindex, MASK mask, IMM scale

__m128i _mm256_mask_i64gather_epi32(__m128i src,
                                    int const* base_addr,
                                    __m256i vindex,
                                    __m128i mask,
                                    const int scale);

Intel Description

Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*32
        m := j*64
        IF mask[i+31]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+31:i] := MEM[addr+31:addr]
        ELSE
                dst[i+31:i] := src[i+31:i]
        FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

_mm256_mask_i64gather_epi64#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale

Param ETypes:

UI64 src, UI64 base_addr, SI64 vindex, MASK mask, IMM scale

__m256i _mm256_mask_i64gather_epi64(
    __m256i src, __int64 const* base_addr, __m256i vindex,
    __m256i mask, const int scale);

Intel Description

Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at “base_addr” and offset by each 64-bit element in “vindex” (each index is scaled by the factor in “scale”). Gathered elements are merged into “dst” using “mask” (elements are copied from “src” when the highest bit is not set in the corresponding element). “scale” should be 1, 2, 4 or 8.

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        m := j*64
        IF mask[i+63]
                addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
                dst[i+63:i] := MEM[addr+63:addr]
        ELSE
                dst[i+63:i] := src[i+63:i]
        FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

_mm256_maskload_epi32#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

int const* mem_addr, __m256i mask

Param ETypes:

UI32 mem_addr, MASK mask

__m256i _mm256_maskload_epi32(int const* mem_addr,
                              __m256i mask);

Intel Description

Load packed 32-bit integers from memory into “dst” using “mask” (elements are zeroed out when the highest bit is not set in the corresponding element).

Intel Implementation Pseudo-Code

FOR j := 0 to 7
        i := j*32
        IF mask[i+31]
                dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
        ELSE
                dst[i+31:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_maskload_epi64#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

__int64 const* mem_addr, __m256i mask

Param ETypes:

UI64 mem_addr, MASK mask

__m256i _mm256_maskload_epi64(__int64 const* mem_addr,
                              __m256i mask);

Intel Description

Load packed 64-bit integers from memory into “dst” using “mask” (elements are zeroed out when the highest bit is not set in the corresponding element).

Intel Implementation Pseudo-Code

FOR j := 0 to 3
        i := j*64
        IF mask[i+63]
                dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
        ELSE
                dst[i+63:i] := 0
        FI
ENDFOR
dst[MAX:256] := 0

_mm256_stream_load_si256#

Tech:

AVX_ALL

Category:

Load

Header:

immintrin.h

Searchable:

AVX_ALL-Load-YMM

Register:

YMM 256 bit

Return Type:

__m256i

Param Types:

void const* mem_addr

Param ETypes:

M256 mem_addr

__m256i _mm256_stream_load_si256(void const* mem_addr);

Intel Description

Load 256-bits of integer data from memory into “dst” using a non-temporal memory hint.

“mem_addr” must be aligned on a 32-byte boundary or a general-protection exception may be generated.

Intel Implementation Pseudo-Code

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0