DevGAMM Conference
A brief overview of the ARMv7 architecture and its features; the essentials of how ARMv7 cores are implemented; and a more detailed look at NEON: what it is, why to use it, and its practical applications.
CODE VECTORIZATION for mobile devices
by Dmitriy Vovk
Hardware
• Typical hardware found in modern mobile devices:
  – ARMv7 architecture
  – Cortex-A8 / Cortex-A9 / custom cores (Krait, Swift)
  – 800-1500 MHz
  – 1-4 cores
  – Thumb-2 instruction set
  – VFPv3
  – NEON, optional for Cortex-A9; Nvidia Tegra 2 has no NEON support
NEON
• NEON is a general-purpose SIMD engine designed by ARM for the ARM processor architecture
• 16 registers, 128 bits wide each. Supports operations on 8-, 16-, 32- and 64-bit integers and 32-bit float values
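To illustrate the lane-wise model, here is a plain-C sketch (no NEON headers; the `_sketch` names are ours, not ARM's) of what a single 128-bit operation such as vaddq_f32 computes across its four float lanes:

```c
#include <stddef.h>

/* Scalar model of vaddq_f32: adds two "registers" of four 32-bit
 * floats lane by lane. On real hardware this is a single NEON
 * instruction operating on 128-bit q registers. */
typedef struct { float lane[4]; } float32x4_sketch;

static float32x4_sketch vaddq_f32_sketch(float32x4_sketch a, float32x4_sketch b)
{
    float32x4_sketch r;
    for (int i = 0; i < 4; ++i)
        r.lane[i] = a.lane[i] + b.lane[i];
    return r;
}
```

The point of the sketch: one NEON instruction replaces four scalar FPU operations, which is where the speedups below come from.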
NEON
• NEON can be used for:
  – Software geometry instancing;
  – Skinning on ES 1.1;
  – As a general vertex processor;
  – Other typical applications for SIMD.
NEON
• Some unified shader architectures, such as the popular Imagination Technologies USSE1 (PowerVR SGX 530-545), are scalar, while NEON is vector by nature. Move your vertex processing from the GPU to the CPU to speed up calculations*
• ???
• PROFIT!!!111
• *NOTE: this does not apply to USSE2 hardware
NEON
• The weakest side of mobile GPUs is fill rate, and fill rate is quickly killed by blending; 2D games are heavy on blending. The PowerVR USSE engine does not care whether it does vertex or fragment processing, so moving your vertex processing to the CPU (NEON) leaves more room for fragment processing.
NEON
• There are 3 ways to use NEON vectorization in your code:
  1. Intrinsics
  2. Handwritten NEON assembly
  3. Autovectorization by the compiler:
     – LLVM: -mllvm -vectorize -mllvm -bb-vectorize-aligned-only
     – GCC: -ftree-vectorize -ftree-vectorizer-verbose=4 -mfpu=neon -funsafe-math-optimizations
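For option 3, both autovectorizers favor simple, dependency-free loops over contiguous data. A minimal sketch (scale_add is a hypothetical helper, not from the talk) of a loop shaped so that the flags above can turn it into NEON code:

```c
#include <stddef.h>

/* A loop the GCC/LLVM autovectorizers can handle: contiguous float
 * arrays, __restrict__ to rule out aliasing, no cross-iteration
 * dependencies. The multiply-accumulate maps naturally to vmla.f32
 * when the compiler vectorizes it. */
void scale_add(float* __restrict__ dst,
               const float* __restrict__ src,
               float scale, size_t count)
{
    for (size_t i = 0; i < count; ++i)
        dst[i] += src[i] * scale;
}
```

If the compiler cannot prove alignment or independence, it silently falls back to scalar code, which is why the verbose flag above is useful for checking what actually got vectorized.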
DEMO
Measurements
• Intrinsics: (profiler screenshot omitted)
• Assembly: (profiler screenshot omitted)
Measurements • Summary:
• Intrinsics got me a 25% speedup over assembly.
• Note that the speed of intrinsics code varies from compiler to compiler.
Method               Running time, ms   CPU usage, %
Intrinsics           2764               19
Assembly             3664               20
FPU                  6209               25-28
FPU autovectorized   5028               22-24
NEON
• Intrinsics advantages over assembly:
  – Higher-level code;
  – No need to manage registers;
  – You can vectorize basic blocks and build a solution to every new problem from these blocks. In contrast, with assembly you have to solve each new problem from scratch.
NEON
• Assembly advantages over intrinsics:
  – Code generated from intrinsics varies from compiler to compiler and can give you a really big difference in speed. Assembly code will always be the same.
Code

void Update()
{
    GLKMatrix4 modelviewMat = { 1, 0, 0, 0,
                                0, 1, 0, 0,
                                0, 0, 1, 0,
                                0, 0, 0, 1 };
    const float Y_DELTA = 420.0f / QUADS_COUNT;
    for (int i = 0; i < QUADS_COUNT * VERTS_PER_QUAD; i += VERTS_PER_QUAD)
    {
        modelviewMat.m[12] = random() % 260;
        modelviewMat.m[13] = Y_DELTA;
#ifdef ASM
        CalculateSpriteVertsWorldPos((float32x4x4_t*)proj.m,
                                     (float32x4x4_t*)modelviewMat.m,
                                     (float32x4_t*)&data[i + 0].pos,
                                     (float32x4_t*)&data[i + 1].pos,
                                     (float32x4_t*)&data[i + 2].pos,
                                     (float32x4_t*)&data[i + 3].pos);
#else
        float32x4x4_t modelviewProj;
        Matrix4ByMatrix4((float32x4x4_t*)proj.m,
                         (float32x4x4_t*)modelviewMat.m,
                         &modelviewProj);
        for (int j = 0; j < 4; ++j)
        {
            Matrix4ByVec4(&modelviewProj,
                          (float32x4_t*)&squareVertices[j],
                          (float32x4_t*)&data[i + j].pos);
        }
#endif
    }
    glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
    glBufferData(GL_ARRAY_BUFFER, sizeof(data), data, GL_STREAM_DRAW);
}
Code

__attribute__((always_inline))
void Matrix4ByVec4(const float32x4x4_t* __restrict__ mat,
                   const float32x4_t* __restrict__ vec,
                   float32x4_t* __restrict__ result)
{
    (*result) = vmulq_n_f32((*mat).val[0], (*vec)[0]);
    (*result) = vmlaq_n_f32((*result), (*mat).val[1], (*vec)[1]);
    (*result) = vmlaq_n_f32((*result), (*mat).val[2], (*vec)[2]);
    (*result) = vmlaq_n_f32((*result), (*mat).val[3], (*vec)[3]);
}
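For comparison, a scalar version of the same column-major matrix-times-vector product, roughly what the "FPU" row in the measurements corresponds to (a sketch using plain float arrays, not the original code):

```c
/* Scalar reference: column-major 4x4 matrix times vec4.
 * mat[col*4 + row] is the element at the given row and column,
 * matching the layout the NEON version reads column by column. */
void Matrix4ByVec4_FPU(const float mat[16], const float vec[4], float result[4])
{
    for (int row = 0; row < 4; ++row)
        result[row] = mat[0*4 + row] * vec[0]
                    + mat[1*4 + row] * vec[1]
                    + mat[2*4 + row] * vec[2]
                    + mat[3*4 + row] * vec[3];
}
```

Each result lane here costs four multiplies and three adds on the FPU; the NEON version above does all four lanes per vmulq/vmlaq instruction.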
Code

__attribute__((always_inline))
void Matrix4ByMatrix4(const float32x4x4_t* __restrict__ m1,
                      const float32x4x4_t* __restrict__ m2,
                      float32x4x4_t* __restrict__ r)
{
#ifdef INTRINSICS
    (*r).val[0] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[0], 0));
    (*r).val[1] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[1], 0));
    (*r).val[2] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[2], 0));
    (*r).val[3] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[3], 0));
    (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[1], vgetq_lane_f32((*m2).val[0], 1));
    (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[1], vgetq_lane_f32((*m2).val[1], 1));
    (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[1], vgetq_lane_f32((*m2).val[2], 1));
    (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[1], vgetq_lane_f32((*m2).val[3], 1));
    (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[2], vgetq_lane_f32((*m2).val[0], 2));
    (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[2], vgetq_lane_f32((*m2).val[1], 2));
    (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[2], vgetq_lane_f32((*m2).val[2], 2));
    (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[2], vgetq_lane_f32((*m2).val[3], 2));
    (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[3], vgetq_lane_f32((*m2).val[0], 3));
    (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[3], vgetq_lane_f32((*m2).val[1], 3));
    (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[3], vgetq_lane_f32((*m2).val[2], 3));
    (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[3], vgetq_lane_f32((*m2).val[3], 3));
#endif
}
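Likewise, a scalar reference for the column-major matrix product above (a sketch, useful for verifying the NEON results on any platform):

```c
/* Scalar reference: column-major 4x4 matrix product r = m1 * m2.
 * Column col of r is a linear combination of the columns of m1,
 * weighted by the lanes of column col of m2 -- the same dataflow
 * the vmulq_n/vmlaq_n sequence above expresses per q register. */
void Matrix4ByMatrix4_FPU(const float m1[16], const float m2[16], float r[16])
{
    for (int col = 0; col < 4; ++col)
        for (int row = 0; row < 4; ++row) {
            float sum = 0.0f;
            for (int k = 0; k < 4; ++k)
                sum += m1[k*4 + row] * m2[col*4 + k];
            r[col*4 + row] = sum;
        }
}
```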
Code

__asm__ volatile
(
    "vldmia %6, { q0-q3 }  \n\t"
    "vldmia %0, { q8-q11 } \n\t"
    "vmul.f32 q12, q8, d0[0]\n\t"
    "vmul.f32 q13, q8, d2[0]\n\t"
    "vmul.f32 q14, q8, d4[0]\n\t"
    "vmul.f32 q15, q8, d6[0]\n\t"
    "vmla.f32 q12, q9, d0[1]\n\t"
    "vmla.f32 q13, q9, d2[1]\n\t"
    "vmla.f32 q14, q9, d4[1]\n\t"
    "vmla.f32 q15, q9, d6[1]\n\t"
    "vmla.f32 q12, q10, d1[0]\n\t"
    "vmla.f32 q13, q10, d3[0]\n\t"
    "vmla.f32 q14, q10, d5[0]\n\t"
    "vmla.f32 q15, q10, d7[0]\n\t"
    "vmla.f32 q12, q11, d1[1]\n\t"
    "vmla.f32 q13, q11, d3[1]\n\t"
    "vmla.f32 q14, q11, d5[1]\n\t"
    "vmla.f32 q15, q11, d7[1]\n\t"
    "vldmia %1, { q0-q3 }  \n\t"
    "vmul.f32 q8, q12, d0[0]\n\t"
    "vmul.f32 q9, q12, d2[0]\n\t"
    "vmul.f32 q10, q12, d4[0]\n\t"
    "vmul.f32 q11, q12, d6[0]\n\t"
    "vmla.f32 q8, q13, d0[1]\n\t"
    "vmla.f32 q8, q14, d1[0]\n\t"
    "vmla.f32 q8, q15, d1[1]\n\t"
    "vmla.f32 q9, q13, d2[1]\n\t"
    "vmla.f32 q9, q14, d3[0]\n\t"
    "vmla.f32 q9, q15, d3[1]\n\t"
    "vmla.f32 q10, q13, d4[1]\n\t"
    "vmla.f32 q10, q14, d5[0]\n\t"
    "vmla.f32 q10, q15, d5[1]\n\t"
    "vmla.f32 q11, q13, d6[1]\n\t"
    "vmla.f32 q11, q14, d7[0]\n\t"
    "vmla.f32 q11, q15, d7[1]\n\t"
    "vstmia %2, { q8 }\n\t"
    "vstmia %3, { q9 }\n\t"
    "vstmia %4, { q10 }\n\t"
    "vstmia %5, { q11 }"
    :
    : "r" (proj), "r" (squareVertices), "r" (v1), "r" (v2), "r" (v3), "r" (v4), "r" (modelView)
    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15"
);
Docs
• For a detailed explanation of intrinsics/assembly see: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491e/CIHJBEFE.html
Contact me
http://www.linkedin.com/in/dvovk/
http://nukecode.blogspot.com/