//C code



// Per-channel offset of the sample colour from the mean.
float x = color.r - mean[0];
float y = color.g - mean[1];
float z = color.b - mean[2];

// Quadratic form d^T * invCovar * d with d = (x, y, z): each parenthesised
// sum is one row of invCovar dotted with d, and that sum is then weighted by
// the matching component of d.
float expTerm =
    x * (invCovar[0][0] * x + invCovar[0][1] * y + invCovar[0][2] * z) +
    y * (invCovar[1][0] * x + invCovar[1][1] * y + invCovar[1][2] * z) +
    z * (invCovar[2][0] * x + invCovar[2][1] * y + invCovar[2][2] * z);

 

//SIMD Code

// Lane comments below list elements from the highest lane down to the lowest,
// which is the same order in which _mm_set_ps takes its arguments.

// d = color - mean, with the unused top lane forced to zero.
__m128 color_src  = _mm_set_ps( 0, color.b, color.g, color.r );
__m128 mean_src   = _mm_set_ps( 0, mean[2], mean[1], mean[0] );
__m128 sub_result = _mm_sub_ps( color_src, mean_src );   // [0][z][y][x]

// One register per row of invCovar, padded with a zero in the top lane.
__m128 tmp_1 = _mm_set_ps( 0, invCovar[0][2], invCovar[0][1], invCovar[0][0] );
__m128 tmp_2 = _mm_set_ps( 0, invCovar[1][2], invCovar[1][1], invCovar[1][0] );
__m128 tmp_3 = _mm_set_ps( 0, invCovar[2][2], invCovar[2][1], invCovar[2][0] );

// Element-wise products of d with each row (A = row 0, B = row 1, C = row 2).
__m128 result_1 = _mm_mul_ps( sub_result, tmp_1 );   // [0][A2][A1][A0]
__m128 result_2 = _mm_mul_ps( sub_result, tmp_2 );   // [0][B2][B1][B0]
__m128 result_3 = _mm_mul_ps( sub_result, tmp_3 );   // [0][C2][C1][C0]

// Two rounds of horizontal adds collapse each row's products into a single
// row-dot-d sum; the sums land in lane order A, B, C from the lowest lane up,
// so they line up with x, y, z in sub_result.
__m128 result_4 = _mm_hadd_ps( result_1, result_2 ); // [0+B2][B1+B0][0+A2][A1+A0]
__m128 result_5 = _mm_hadd_ps( result_3, result_2 ); // [0+B2][B1+B0][0+C2][C1+C0]
__m128 result_6 = _mm_hadd_ps( result_4, result_5 ); // [0+B2+B1+B0][0+C2+C1+C0][0+B2+B1+B0][0+A2+A1+A0]

// Weight each row sum by the matching component of d. The duplicate B sum in
// the top lane is multiplied by the zero held in the top lane of sub_result.
__m128 result_7 = _mm_mul_ps( result_6, sub_result ); // [0][Z*(0+C2+C1+C0)][Y*(0+B2+B1+B0)][X*(0+A2+A1+A0)]

// Horizontal add across all four lanes: after the second hadd every lane of
// result_9 holds the sum of the four lanes of result_7.
__m128 result_8 = _mm_hadd_ps( result_7, result_7 );
__m128 result_9 = _mm_hadd_ps( result_8, result_8 );

// C++ Time = 0.000135761ms
// SIMD Time = 0.000134692ms
arrow
arrow
    全站熱搜

    chunyuan 發表在 痞客邦 留言(0) 人氣()