SSE coding

SSE指令集入门教程

简介

官方文档:Intel Intrinsics Guide

SSE指令集通过编译器提供的内建函数(intrinsics)来使用,写法上与CUDA中调用内置函数类似:先把数据加载到128位的 `__m128` 变量中,再调用相应的运算函数进行计算。

示例代码(标量SSE指令的基本用法):
/**
 * Minimal demonstration of the scalar ("_ss") SSE intrinsics.
 *
 * Loads one float from each pointer, then repeats LOOP times:
 * multiply, square root, add, and increment both operands by 1.0f.
 * The computed values are never stored back, so the function has no
 * observable result; it only illustrates how the intrinsics are called.
 *
 * @param a pointer to the first input float (only *a is read)
 * @param b pointer to the second input float (only *b is read)
 */
void sse_cal(float *a,float*b)
{
    // Load the single float that each pointer refers to into an SSE value.
    __m128 lhs = _mm_load_ss(a);
    __m128 rhs = _mm_load_ss(b);
    // Constant increment of 1.0f.
    const __m128 one = _mm_set_ss(1.0f);

    int iteration = 0;
    while (iteration < LOOP)
    {
        // Each intrinsic plays the role of an ordinary arithmetic operator.
        const __m128 product = _mm_mul_ss(lhs, rhs);       // lhs * rhs
        const __m128 root = _mm_sqrt_ss(rhs);              // square root of rhs
        const __m128 total = _mm_add_ss(product, root);    // product + root
        (void)total; // demo only: the result is intentionally discarded

        // Advance both operands by one for the next iteration.
        lhs = _mm_add_ss(lhs, one);
        rhs = _mm_add_ss(rhs, one);
        ++iteration;
    }
}

参考:

入门教程

SSE指令的使用学习

SSE双线性插值

OpenMP+SSE

示例代码(普通版本与 OpenMP+SSE 版本的点云变换):
/**
 * Transforms every point of a cloud in place with a 4x4 matrix.
 *
 * Each point p is replaced by R * p + t, where R is the top-left 3x3 block
 * and t is the top-right 3x1 block of the matrix. The bottom row of the
 * matrix is not used.
 *
 * @param pointcloud            points to transform; overwritten with the result
 * @param transformation_matrix 4x4 transform; if it equals the identity the
 *                              function returns without touching the points
 */
void TransformPointCloud(std::vector<Vector3>& pointcloud, const Eigen::Matrix4& transformation_matrix)
{
    // The identity transform leaves every point unchanged, so skip the pass.
    if (transformation_matrix == Eigen::Matrix4::Identity())
    {
        return;
    }
    // Cache the size as a signed int: this avoids the signed/unsigned
    // comparison in the loop condition and keeps a signed loop index, which
    // older OpenMP implementations require.
    const int point_count = static_cast<int>(pointcloud.size());
    // Fix: the directive previously read "#pragma parallel for". Without the
    // "omp" prefix it is an unknown pragma that compilers ignore, so the loop
    // ran serially. Iterations are independent (each touches only its own
    // element), so they can safely be distributed across threads.
#pragma omp parallel for
    for (int i = 0; i < point_count; i++)
    {
        pointcloud[i] = transformation_matrix.topLeftCorner<3, 3>()*pointcloud[i] + transformation_matrix.topRightCorner<3, 1>();
    }
}


/**
 * SSE + OpenMP version of the point-cloud transform.
 *
 * For every input point (x, y, z) computes
 *     x * col0 + y * col1 + z * col2 + col3
 * where colK is the K-th column of the matrix held in one 128-bit register,
 * and writes the first three lanes of the result to the output point.
 *
 * NOTE(review): assumes the matrix and the points store single-precision
 * floats and that each matrix column is four contiguous floats (column-major
 * storage) - confirm against the Vector3 / Eigen::Matrix4 definitions.
 *
 * @param pointsIn              source points (read only)
 * @param pointsOut             destination; resized to pointsIn.size()
 * @param transformation_matrix 4x4 transform applied to every point
 */
void TransformPointCloud_SSE(std::vector<Vector3>& pointsIn, std::vector<Vector3>& pointsOut, const Eigen::Matrix4& transformation_matrix)
{
    const int num = static_cast<int>(pointsIn.size());
    // resize() is a no-op when the size already matches, so no explicit
    // (signed vs. unsigned) size comparison is needed.
    pointsOut.resize(pointsIn.size());

    // Load the four matrix columns once, outside the hot loop.
    // Unaligned loads are used so the function does not depend on the
    // caller's matrix being 16-byte aligned (the aligned _mm_load_ps faults
    // on a misaligned address).
    __m128 c[4];
    for (int k = 0; k < 4; k++)
    {
        c[k] = _mm_loadu_ps(transformation_matrix.col(k).data());
    }

    // Iterations are independent: each reads pointsIn[i] and writes pointsOut[i].
#pragma omp parallel for
    for (int i = 0; i < num; i++)
    {
        // Broadcast each coordinate to all four lanes and scale its column.
        const __m128 p0 = _mm_mul_ps(_mm_set1_ps(pointsIn[i].x()), c[0]);
        const __m128 p1 = _mm_mul_ps(_mm_set1_ps(pointsIn[i].y()), c[1]);
        const __m128 p2 = _mm_mul_ps(_mm_set1_ps(pointsIn[i].z()), c[2]);

        // Same summation order as before: p0 + (p1 + (p2 + col3)).
        const __m128 sum = _mm_add_ps(p0, _mm_add_ps(p1, _mm_add_ps(p2, c[3])));

        // Spill all four lanes to a per-iteration temporary, then keep x, y, z.
        Vector4 temp;
        _mm_storeu_ps(temp.data(), sum);
        pointsOut[i] = temp.head<3>();
    }
}

测试条件:780000个点,运行100次。

普通点云变换耗时:582 ms
SSE加速变换耗时:173 ms

# SSE

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×