57
Нижегородский государственный университет им. Н.И. Лобачевского. Факультет Вычислительной математики и кибернетики. Лабораторная работа №4. Оптимизация расчетов на примере задачи вычисления справедливой цены опциона Европейского типа. Программирование для Intel Xeon Phi. Мееров И.Б., Сысоев А.В. Кафедра математического обеспечения ЭВМ. При поддержке компании Intel.

Программирование для Intel Xeon Phi

  • Upload
    arista

  • View
    85

  • Download
    5

Embed Size (px)

DESCRIPTION

Лабораторная работа № 4. Оптимизация расчетов на примере задачи вычисления справедливой цены опциона Европейского типа. Программирование для Intel Xeon Phi. При поддержке компании Intel. Мееров И.Б., Сысоев А.В. Кафедра математического обеспечения ЭВМ. Вы думаете, все так просто? - PowerPoint PPT Presentation

Citation preview

4

4 Intel Xeon Phi .., .. Intel . .. 1, ? , . *

. .*

*

#. , 2013 .

Xeon Xeon Phi:

#. , 2013 .

1. #. , 2013 .

.: . . Intel Xeon Intel Xeon Phi. .

#. , 2013 .

2 Intel Xeon E5-2690 (2.9 GHz)2 Intel Xeon Phi 7110X (61 )64 GB Linux CentOS 6.2, Intel Parallel Studio XE 2013 SP1#. , 2013 .

2. #. , 2013 .

- (1) (2) - t - t - ( ) - ( ) - ( ) - (E=0 P=1, , Wt Ws ~ N(0, t-s), s < t), Wt () P=1. - .

#. , 2013 .

-

. , S ( ). , S ( ). , .

#. , 2013 .

-,

:

(3)

( ) (, -), Wt N(0, t).

#. , 2013 .

?

tS , Wt - . 1 , .

#. , 2013 .

P1 P2, P2 t P1 P1 K, . P2 () C P1. K (, strike price), C .

#. , 2013 .

- . P1 P2. C T ( , maturity, ) : K . ST K. ST < K, , C, C. ST > K, K, ( C ST K).

#. , 2013 .

, / . P2: (4)

T , (1. t = T 1. t = 0)#. , 2013 .

- . t=0 (F ):

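$$C = S_0 F(d_1) - K e^{-rT} F(d_2), \qquad d_{1,2} = \frac{\ln(S_0/K) + \left(r \pm \sigma^2/2\right) T}{\sigma \sqrt{T}} \qquad (5)$$#. , 2013 .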

3. #. , 2013 .

. , , , . , , . .

#. , 2013 .

#. , 2013 .

4. #. , 2013 .

(AoS – Array of Structures)

(SoA – Structure of Arrays): , . .#. , 2013 .

. int numThreads = 1;int N = 60000000;int main(int argc, char *argv[]){ int version; if (argc < 3) { printf("Usage: size version [#of_threads]\n"); return 1; } N = atoi(argv[1]); version = atoi(argv[2]); if (argc > 3) numThreads = atoi(argv[3]);

//

float res = GetOptionPrice(); printf("%.8f;\n", res); return 0;}#. , 2013 .

. const float sig = 0.2f;const float r = 0.05f;const float T = 3.0f;const float S0 = 100.0f;const float K = 100.0f;

float GetOptionPrice() { float C, d1, d2, p1, p2; d1 = (logf(S0 / K) + (r + sig * sig * 0.5f) * T) / (sig * sqrtf(T)); d2 = (logf(S0 / K) + (r - sig * sig * 0.5f) * T) / (sig * sqrtf(T)); p1 = cdfnormf(d1); p2 = cdfnormf(d2); C = S0 * p1 - K * expf((-1.0f) * r * T) * p2; return C;}#. , 2013 .

. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (log(pS0[i] / pK[i]) + (r + sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); d2 = (log(pS0[i] / pK[i]) + (r - sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * exp((-1.0) * r * pT[i]) * p2; }}#. , 2013 .

. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (log(pS0[i] / pK[i]) + (r + sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); d2 = (log(pS0[i] / pK[i]) + (r - sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * exp((-1.0) * r * pT[i]) * p2; }}#. , 2013 .

. N60000000120000000180000000240000000 V0 ()17,00234,00451,00867,970 !

#. , 2013 .

1. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * expf((-1.0f) * r * pT[i]) * p2; }}#. , 2013 .

1. N60000000120000000180000000240000000 V0()17,00234,00451,00867,970 V1()16,77633,54950,33766,989 ? 3

#. , 2013 .

2. : cdfnorm() vs. erf() erff() cdfnormf(), .

: ?

#. , 2013 .

2. : cdfnorm() vs. erf()__declspec(noinline) void GetOptionPricesV2(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

2. : cdfnorm() vs. erf()N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 !

#. , 2013 .

3. : restrict ?

restrict?

restrict?

restrict, . ?#. , 2013 .

3. : restrict ? -vec-report3 -vec-report6 (Linux) /Qvec-report3 /Qvec-report6 (Windows) -mavx ( SSE, AVX).

, ? , .

#. , 2013 .

3. : restrict__declspec(noinline) void GetOptionPricesV3( float * restrict pT, float * restrict pK, float * restrict pS0, float * restrict pC) { int i; float d1, d2, erf1, erf2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

4. : simd__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

4*. : ivdep vector always__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma ivdep#pragma vector always for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

3-4. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,0671. : loop was vectorized (SIMD loop was vectorized)2. 3 , . 3. 8 , 5,43. .. 2 3!

#. , 2013 .

__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd // Intel . ivdep for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}5.43 #. , 2013 .

5. const float invsqrt2 = 0.707106781f;__declspec(noinline) void GetOptionPricesV5(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 * invsqrt2); erf2 = 0.5f + 0.5f * erff(d2 * invsqrt2); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }} . ? , #. , 2013 .

5. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085

#. , 2013 .

6. . __declspec(noinline) void GetOptionPricesV6(float *pT, float *pK, float *pS0, float *pC) { int i; float d1, d2, erf1, erf2, invf; float sig2 = sig * sig;#pragma simd for (i = 0; i < N; i++) { invf = invsqrtf(sig2 * pT[i]); d1 = (logf(pS0[i] / pK[i]) + (r + sig2 * 0.5f) * pT[i]) * invf; d2 = (logf(pS0[i] / pK[i]) + (r - sig2 * 0.5f) * pT[i]) * invf; erf1 = 0.5f + 0.5f * erff(d1 * invsqrt2); erf2 = 0.5f + 0.5f * erff(d2 * invsqrt2); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

6. . N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 . .

#. , 2013 .

6.1. SSE: 16, AVX: 32, Xeon Phi: 64memalign() -> _mm_malloc()Windows: __declspec(align(XX)) float T[N];Linux: float T[N] __attribute__((aligned(64)));#pragma vector aligned, __assume_aligned, __assumeint main(int argc, char *argv[]){ pT = (float *)memalign(32, 4 * N * sizeof(float));// pT = new float[4 * N]; ... free(pT);// delete [] pT; return 0;} #. , 2013 .

6.2. .icc ... -fimf-precision=low -fimf-domain-exclusion=31 N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724#. , 2013 .

7. #pragma omp parallel for private(invf, d1, d2, erf1, erf2)N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724 V7(16 )0,0580,0840,1260,153#. , 2013 .

7.1. . .

: ?#. , 2013 .

7.1. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724 V70,0580,0840,1260,153 V6.30,4090,8121,2261,603 V7.10,0330,0620,0910,118#. , 2013 .

7.1. 7.1 6.3 , 7 6.2, 12.54 (60 . ) 13,61 (240 . ). 6.3 7,5% , 6.2, 7.1 , 7 ( 60 . , 70%, ).#. , 2013 .

Xeon Phi , 7. , , . , 6, Xeon. Xeon Phi -mmic. memalign() 32 64. . .

#. , 2013 .

Xeon Phi. Xeon Phi ( 6.2) 23%, , 2,3 . ( 60%) , , , . 6.3. . N60000000120000000180000000240000000 V61,5443,0894,6336,174 V6.11,5453,0914,6346,179 V6.20,6761,3522,0272,703 V6.30,4220,8451,2691,690#. , 2013 .

Xeon Phi. N60000000120000000180000000240000000 V70,1340,1490,1640,175S(V6.2/V7)5,03369,05012,33115,437 V7.10,0080,0170,0250,033S(V6.3/V7.1)50,58551,17851,78351,546N60000000120000000180000000240000000 V70,2340,2550,2570,255S(V6.2/V7)2,8855,3037,88310,590 V7.10,0070,0140,0210,028S(V6.3/V7.1)59,42259,58760,38959,839N60000000120000000180000000240000000 V70,5320,5270,5330,558S(V6.2/V7)1,2692,5643,8004,842 V7.10,0080,0160,0240,031S(V6.3/V7.1)53,28654,24853,96953,96460120240#. , 2013 .

Xeon Phi. 7 . , . ( 7.1) ( 50,5 60,4). 120 , 60 , Xeon Phi.

- ?#. , 2013 .

8. - 4 (pT, pK, pS0, pC). 3 , (pC) . , pC, . , ( pC nontemporal data). , , , nontemporal data, streaming stores, .#pragma vector nontemporal#. , 2013 .

8. -N60000000120000000180000000240000000 V80,0090,0180,0270,035S(V6.3/V8)46,87847,50547,61047,832N60000000120000000180000000240000000 V80,0070,0130,0190,026S(V6.3/V8)63,87365,04865,42665,887N60000000120000000180000000240000000 V80,0070,0130,0190,026S(V6.3/V8)63,22264,76865,37965,42060120240 54 65.

#. , 2013 .

Xeon vs. Xeon PhiN60000000120000000180000000240000000Xeon0,0300,0610,0900,116Xeon Phi0,0070,0130,0190,026#. , 2013 .

, . , Call Put. , Xeon Xeon Phi, , Xeon Phi . Xeon Phi . , .#. , 2013 .

. , 2. . 3- . .: , 2007. . 832. .. . , 2004. 1076. .., .., .., .. . . . 4 . : - , 2013. 1394 .#. , 2013 .

, ..., , . . [email protected] ,..., [email protected]

#. , 2013 .

12.012.42.882.62.542.362.242.832.572.651.342.482.472.532.142.832.292.142.091.82.242.332.942.882.562.172.092.562.82.352.772.62.212.992.152.492.62.42.022.142.212.12.592.742.972.051.42.262.852.452.332.692.642.72.91.82.352.462.012.492.822.122.562.913.82.842.322.752.672.962.242.42.062.12.162.282.242.52.172.122.632.583.42.482.722.012.592.042.712.8821.82.532.662.882.962.012.142.522.213.82.282.472.752.052.372.392.12.082.092.282.522.992.632.592.952.362.732.952.722.862.832.752.332.462.022.1232.192.932.942.042.152.482.312.752.22.742.432.252.282.6132.662.012.882.152.262.562.352.472.692.152.352.852.112.622.822.762.442.042.562.432.52.012.072.922.552.72.342.322.022.22.172.892.372.382.722.572.45

1 2 3 4 5 6 7 8 9

1 1 2 3 4 5 6 7 8 902.012.42.882.62.542.362.242.832.57t12.651.342.482.472.532.142.832.292.14t22.091.82.242.332.942.882.562.172.09t32.562.82.352.772.62.212.992.152.49t42.62.42.022.142.212.12.592.742.97t52.051.42.262.852.452.332.692.642.7t62.91.82.352.462.012.492.822.122.56t72.913.82.842.322.752.672.962.242.4t82.062.12.162.282.242.52.172.122.63t92.583.42.482.722.012.592.042.712.88t1021.82.532.662.882.962.012.142.52t112.213.82.282.472.752.052.372.392.1t122.082.092.282.522.992.632.592.952.36t132.732.952.722.862.832.752.332.462.02t142.1232.192.932.942.042.152.482.31t152.752.22.742.432.252.282.6132.66t162.012.882.152.262.562.352.472.692.15t172.352.852.112.622.822.762.442.042.56t182.432.52.012.072.922.552.72.342.32T2.022.22.172.892.372.382.722.572.45