SSE指令集入门教程
简介
官方文档:Intel Intrinsics Guide
SSE指令集的使用流程与CUDA类似:先将数据从内存加载(load)到专用寄存器,在寄存器上完成计算,需要时再将结果存回(store)内存。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| void sse_cal(float *a,float*b) { __m128 m1, m2, m3; __m128 SSEA = _mm_load_ss(a); __m128 SSEB = _mm_load_ss(b); __m128 h = _mm_set_ss(1.0f);
for(int i=0;i<LOOP;i++) { m1 = _mm_mul_ss(SSEA, SSEB); m2 = _mm_sqrt_ss(SSEB); m3 = _mm_add_ss(m1,m2); SSEA = _mm_add_ss(SSEA, h); SSEB = _mm_add_ss(SSEB, h); } }
|
参考:
入门教程
SSE指令的使用学习
SSE双线性插值
OpenMP+SSE
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
| void TransformPointCloud(std::vector<Vector3>& pointcloud, const Eigen::Matrix4& transformation_matrix) { if (transformation_matrix == Eigen::Matrix4::Identity()) { return; } #pragma parallel for for (int i = 0; i < pointcloud.size(); i++) { pointcloud[i] = transformation_matrix.topLeftCorner<3, 3>()*pointcloud[i] + transformation_matrix.topRightCorner<3, 1>(); } }
// SSE version of the point-cloud transform.
//
// For every input point (x, y, z) computes
//     c0 * x + c1 * y + c2 * z + c3
// where c0..c3 are the four columns of `transformation_matrix`, and stores
// the first three components of the result in `pointsOut[i]`.
//
// `pointsOut` is resized to match `pointsIn` when their sizes differ.
//
// NOTE(review): reading each column through `col(i).data()` as four
// consecutive floats presumes a column-major, single-precision matrix, and
// `temp.data()` presumes Vector4 holds four floats - confirm against the
// project's Matrix4 / Vector4 typedefs.
void TransformPointCloud_SSE(std::vector<Vector3>& pointsIn, std::vector<Vector3>& pointsOut, const Eigen::Matrix4& transformation_matrix)
{
    if (pointsOut.size() != pointsIn.size()) {
        pointsOut.resize(pointsIn.size());
    }

    // Signed counter for the OpenMP loop below; computed once so the loop
    // bound is not a signed-vs-unsigned comparison.
    const int num = static_cast<int>(pointsIn.size());

    // Unaligned loads: the caller's matrix is not guaranteed to sit on a
    // 16-byte boundary (e.g. when it is a member of a heap-allocated object),
    // and an aligned load on a misaligned address faults.
    __m128 c[4];
    for (int i = 0; i < 4; i++) {
        c[i] = _mm_loadu_ps(transformation_matrix.col(i).data());
    }

#pragma omp parallel for
    for (int i = 0; i < num; i++) {
        // Broadcast each coordinate to all four lanes and scale the
        // corresponding matrix column.
        const __m128 p0 = _mm_mul_ps(_mm_load_ps1(&(pointsIn[i].x())), c[0]);
        const __m128 p1 = _mm_mul_ps(_mm_load_ps1(&(pointsIn[i].y())), c[1]);
        const __m128 p2 = _mm_mul_ps(_mm_load_ps1(&(pointsIn[i].z())), c[2]);

        // Summation order is kept as p0 + (p1 + (p2 + c3)) so the
        // floating-point result is unchanged.
        Vector4 temp;
        _mm_storeu_ps(temp.data(), _mm_add_ps(p0, _mm_add_ps(p1, _mm_add_ps(p2, c[3]))));
        pointsOut[i] = temp.head<3>();
    }
}
|
测试条件:点云共780000个点,变换重复运行100次
普通点云变换时间为:582ms
SSE加速变换时间为:173ms