Implementing a masked SVML-like function explicitly in a user-defined way

Published Date
05 - May - 2017
| Last Updated
05 - May - 2017
Implementing a masked SVML-like function explicitly in user defin...

The Intel Compiler provides SIMD intrinsic APIs for the Short Vector Math Library (SVML), and starting with the AVX512 generation it also exposes masked versions of SVML functions to users; e.g., see zmmintrin.h:

extern __m512d __ICL_INTRINCC _mm512_mask_exp_pd(__m512d, __mmask8, __m512d);

Masked SIMD functions are handy, just like masked instructions – one can use a mask as a vector predicate to avoid computations on certain elements in a vector register, e.g. because of unwanted floating-point, memory or performance side-effects. The Intel Compiler autovectorizer has always been able to optimize a loop with a condition, like the one below, into a masked SVML function call:

   for (int32_t i=0; i<LEN; i++)
      if (x[i] > 0.0)
        y[i] = exp(x[i]);
        y[i] = 0.0;

AVX512 (-xCORE-AVX512) code generation (disassembly) snippet for the above code:

 ..B1.24:                        # Preds ..B1.59 ..B1.23
                                # Execution count [8.48e-01]
        vpcmpud   $1, %ymm16, %ymm18, %k6                       #54.17
        vmovupd   (%rbx,%r12,8), %zmm2{%k6}{z}                  #55.9
        vcmppd    $6, %zmm17, %zmm2, %k5                        #55.16
        kandw     %k5, %k6, %k4                                 #55.16
        vmovupd   (%rbx,%r12,8), %zmm1{%k4}{z}                  #56.18
        vmovaps   %zmm17, %zmm0                                 #56.14
        kmovw     %k4, %k1                                      #56.14
        call      __svml_exp8_mask                              #56.14
                                # LOE rbx rsi r12 r13 r14 edi r15d ymm16 ymm18 ymm19 zmm0 zmm17 k4 k5 k6
..B1.59:                        # Preds ..B1.24
                                # Execution count [8.48e-01]
        vpaddd    %ymm19, %ymm18, %ymm18                        #54.17
        kandnw    %k6, %k5, %k1                                 #58.7
        vmovupd   %zmm0, (%r13,%r12,8){%k4}                     #56.7
        vmovupd   %zmm17, (%r13,%r12,8){%k1}                    #58.7
        addq      $8, %r12                                      #54.17
        cmpq      %rsi, %r12                                    #54.17
        jb        ..B1.24       # Prob 82%                      #54.17

Before AVX512, the x86 vector unit instruction set didn’t provide architectural support for vector masks, but the desired behaviour could be easily emulated. For example, here is the AVX2 (-xCORE-AVX2) disassembly for the above conditional code:

..B1.11:                        # Preds ..B1.14 ..B1.10
                                # Execution count [0.00e+00]
        vmovupd   (%rbx,%r14,8), %ymm0                          #55.9
        vcmpgtpd  %ymm10, %ymm0, %ymm11                         #55.16
        vptest    %ymm8, %ymm11                                 #55.16
        je        ..B1.13       # Prob 20%                      #55.16
                                # LOE rbx r12 r13 r14 r15d ymm0 ymm8 ymm9 ymm10 ymm11
..B1.12:                        # Preds ..B1.11
                                # Execution count [8.48e-01]
        vmovdqa   %ymm11, %ymm1                                 #56.14
        call      __svml_exp4_mask                              #56.14
                                # LOE rbx r12 r13 r14 r15d ymm0 ymm8 ymm9 ymm10 ymm11
..B1.39:                        # Preds ..B1.12
                                # Execution count [8.48e-01]
        vmovdqa   %ymm0, %ymm2                                  #56.14
        vmovupd   (%r12,%r14,8), %ymm0                          #56.7
        vblendvpd %ymm11, %ymm2, %ymm0, %ymm2                   #56.7
        jmp       ..B1.14       # Prob 100%                     #56.7
                                # LOE rbx r12 r13 r14 r15d ymm2 ymm8 ymm9 ymm10 ymm11
..B1.13:                        # Preds ..B1.11
                                # Execution count [0.00e+00]
        vmovupd   (%r12,%r14,8), %ymm2                          #58.7
                                # LOE rbx r12 r13 r14 r15d ymm2 ymm8 ymm9 ymm10 ymm11
..B1.14:                        # Preds ..B1.39 ..B1.13
                                # Execution count [8.48e-01]
        vxorpd    %ymm11, %ymm9, %ymm0                          #55.16
        vandnpd   %ymm2, %ymm0, %ymm1                           #58.7
        vmovupd   %ymm1, (%r12,%r14,8)                          #58.7
        addq      $4, %r14                                      #54.17
        cmpq      $8388608, %r14                                #54.17
        jb        ..B1.11       # Prob 82%                      #54.17

So users benefited from masked functions in SVML even before the architecture added support for vector masks. In the recipe below we would like to address users who do not rely on the autovectorizer and choose to call SVML through intrinsics on pre-AVX512 platforms. We are not exposing pre-AVX512 masked APIs through intrinsics at this time; instead we show how users can implement their own masked vector math functions if needed. Here’s an example:

// Masked exp() on four packed doubles for pre-AVX512 targets.
//
//   old_dst - values returned unchanged in the masked-off lanes
//   mask    - per-lane selector as consumed by _mm256_blendv_pd, e.g. the
//             result of _mm256_cmp_pd; a selected lane gets exp(src)
//   src     - input values
//
// Returns exp(src) in the selected lanes and old_dst in all other lanes.
static __forceinline __m256d _mm256_mask_exp_pd(__m256d old_dst, __m256d mask, __m256d src)
{
    // Need to patch masked off inputs with good values
    // that do not cause side-effects like over/underflow/nans/denormals, etc.
    // 0.5 is a good value for EXP and most other functions.
    // acosh is not defined at 0.5, so it can rather use 2.0
    // 0.0 and 1.0 are often bad points, e.g. think log()
    __m256d patchValue = _mm256_set1_pd(0.5);
    __m256d patchedSrc = _mm256_blendv_pd(patchValue, src, mask);
    // compute SVML function on a full register
    // NOTE: one may choose to totally skip expensive call to exp
    // if the mask was all-zeros, this is left as an exercise to
    // the reader.
    __m256d res = _mm256_exp_pd(patchedSrc);
    // discard masked off results, restore values from old_dst
    return _mm256_blendv_pd(old_dst, res, mask);
}

One would probably achieve better performance if the masked function were inlined, thus we use static __forceinline in the declaration. And here’s how one would use this function if the original loop were written with intrinsics:

void vfoo(int n4, double * a, double *r)
    int i;
    for (i = 0; i < n4; i+=4)
        __m256d src, dst, mask;
        src = _mm256_load_pd(a + i);
        // fill mask based on desired condition
        mask = _mm256_cmp_pd(src, _mm256_setzero_pd(), _CMP_GT_OQ);
        // do something useful for the else path
        dst = _mm256_setzero_pd();
        // compute masked exp that will preserve above useful values
        dst = _mm256_mask_exp_pd(dst, mask, src);
        _mm256_store_pd(r + i, dst);

Here’s the assembly listing for the above loop:

..B1.3:                         # Preds ..B1.8 ..B1.2
                                # Execution count [5.00e+00]
        vmovupd   (%rdi,%r12,8), %ymm1                          #25.30
        vcmpgt_oqpd %ymm9, %ymm1, %ymm10                        #28.16
        vblendvpd %ymm10, %ymm1, %ymm8, %ymm0                   #32.15
        call      __svml_exp4                                   #32.15
                                # LOE rbx rsi rdi r12 r13 r14 r15 ymm0 ymm8 ymm9 ymm10
..B1.8:                         # Preds ..B1.3
                                # Execution count [5.00e+00]
        vblendvpd %ymm10, %ymm0, %ymm9, %ymm1                   #32.15
        vmovupd   %ymm1, (%rsi,%r12,8)                          #34.25
        addq      $4, %r12                                      #22.25
        cmpq      %r13, %r12                                    #22.21
        jl        ..B1.3        # Prob 82%                      #22.21

Note: Similarly, we can develop our own masked versions of intrinsics for other functions such as log, sqrt, cos and sin by simply changing “exp” to “cos”, “sin”, etc. in the above sample code. Mind the note on the patch value, though.

For more Modern Code content and tools from Intel, please visit Intel® Modern Code.