SSE coding

SSE指令集入门教程

简介

SSE指令集的使用方法和CUDA类似：先把数据加载到专用的向量变量（`__m128`）中，再调用内置函数（intrinsics）完成运算，如下例所示。

// Minimal walkthrough of the scalar ("_ss") SSE intrinsics.
//
// Loads the floats pointed to by `a` and `b`, then repeats LOOP times:
// multiply, square root, add, and advance both operands by 1.0f.
// All values live in local __m128 variables; nothing is written back through
// `a` or `b` and nothing is returned, so the function has no observable result.
//
// a, b: pointers to the two input floats (only read).
void sse_cal(float *a,float*b)
{
    __m128 lhs = _mm_load_ss(a);            // copy the value at address a
    __m128 rhs = _mm_load_ss(b);            // copy the value at address b
    const __m128 step = _mm_set_ss(1.0f);   // constant increment applied each iteration

    for (int iter = 0; iter < LOOP; ++iter)
    {
        // Element-wise building blocks, one intrinsic per arithmetic step.
        const __m128 product = _mm_mul_ss(lhs, rhs);      // lhs * rhs
        const __m128 root = _mm_sqrt_ss(rhs);             // square root of rhs
        const __m128 total = _mm_add_ss(product, root);   // lhs * rhs + sqrt(rhs)
        (void)total;                                      // computed for demonstration only

        lhs = _mm_add_ss(lhs, step);
        rhs = _mm_add_ss(rhs, step);
    }
}

参考：

入门教程

SSE指令的使用学习

SSE双线性插值

OpenMP+SSE

// Transforms every point of `pointcloud` in place:
//   p <- M.topLeftCorner<3,3>() * p + M.topRightCorner<3,1>()
// where M is `transformation_matrix`.
//
// pointcloud:            points to transform; overwritten with the result.
// transformation_matrix: 4x4 transform; only its upper-left 3x3 block and the
//                        first three entries of its last column are used.
//
// Returns immediately, leaving the points untouched, when the matrix compares
// exactly equal to the identity.
void TransformPointCloud(std::vector<Vector3>& pointcloud, const Eigen::Matrix4& transformation_matrix)
{
	if (transformation_matrix == Eigen::Matrix4::Identity())
	{
		return;
	}
	// Loop-invariant: extract the two blocks once instead of per point.
	const auto linear = transformation_matrix.topLeftCorner<3, 3>().eval();
	const auto translation = transformation_matrix.topRightCorner<3, 1>().eval();
	// Signed count so the loop index and bound have the same type.
	const int num = static_cast<int>(pointcloud.size());
	// The directive needs the `omp` sentinel; a bare `#pragma parallel for` is
	// an unknown pragma and leaves the loop serial. Each iteration touches only
	// pointcloud[i], so iterations are independent.
#pragma omp parallel for
	for (int i = 0; i < num; i++)
	{
		pointcloud[i] = linear * pointcloud[i] + translation;
	}
}


// SSE + OpenMP variant of the point-cloud transform.
//
// For every input point (X, Y, Z) computes
//   col0 * X + col1 * Y + col2 * Z + col3
// where colK is column K of `transformation_matrix`, and stores the first
// three components of that 4-vector in pointsOut.
//
// pointsIn:              source points (only read).
// pointsOut:             destination; resized to pointsIn.size() and overwritten.
// transformation_matrix: 4x4 transform.
//
// NOTE(review): reading each column as four consecutive floats assumes
// column-major storage with a float scalar type — confirm against the
// project's Eigen::Matrix4 typedef.
void TransformPointCloud_SSE(std::vector<Vector3>& pointsIn, std::vector<Vector3>& pointsOut, const Eigen::Matrix4& transformation_matrix)
{
	// resize() is a no-op when the sizes already match.
	pointsOut.resize(pointsIn.size());
	// Signed count so the loop index and bound have the same type.
	const int num = static_cast<int>(pointsIn.size());

	// Unaligned loads: no 16-byte alignment requirement on the matrix storage.
	__m128 c[4];
	for (int i = 0; i < 4; i++)
	{
		c[i] = _mm_loadu_ps(transformation_matrix.col(i).data());
	}
	// With T(r,k) the entry in row r, column k, the four lanes hold:
	//   T(0,0)*X + T(0,1)*Y + T(0,2)*Z + T(0,3)
	//   T(1,0)*X + T(1,1)*Y + T(1,2)*Z + T(1,3)
	//   T(2,0)*X + T(2,1)*Y + T(2,2)*Z + T(2,3)
	//   T(3,0)*X + T(3,1)*Y + T(3,2)*Z + T(3,3)
#pragma omp parallel for
	for (int i = 0; i < num; i++)
	{
		// Broadcast each coordinate to all four lanes and scale its column.
		const __m128 p0 = _mm_mul_ps(_mm_set1_ps(pointsIn[i].x()), c[0]);
		const __m128 p1 = _mm_mul_ps(_mm_set1_ps(pointsIn[i].y()), c[1]);
		const __m128 p2 = _mm_mul_ps(_mm_set1_ps(pointsIn[i].z()), c[2]);
		// Same summation order as before: p0 + (p1 + (p2 + c3)).
		const __m128 sum = _mm_add_ps(p0, _mm_add_ps(p1, _mm_add_ps(p2, c[3])));

		// Unaligned store: no alignment requirement on the temporary.
		Vector4 temp;
		_mm_storeu_ps(temp.data(), sum);
		pointsOut[i] = temp.head<3>();
	}
}

780000个点运行100次

普通点云变换时间为：582ms
SSE加速变换时间为：173ms

# SSE

支付宝

微信

SSE coding

简介

OpenMP+SSE

喜欢这篇文章？打赏一下作者吧

评论

链接

分类

标签云

最新文章

归档

标签

最新文章

归档

标签

Your browser is out-of-date!