我有以下 Eigen C++ 代码,它执行约 1000 万次 squaredNorm 计算。
有没有办法让它更稳健/更快?
#include <iostream>
#include <mutex>
#include <vector>
#include <Eigen/Core>
#include <tbb/parallel_for.h>
#include "tbb/tbb.h"
#include <opencv2/opencv.hpp>
int main(){
int numberOFdata = 10000008;
Eigen::MatrixXf feat = Eigen::MatrixXf::Random(numberOFdata,512);
Eigen::MatrixXf b_cmp= Eigen::MatrixXf::Random(1,512);
int count_feature = feat.rows();
std::vector<int> found_number ;
std::mutex mutex1;
for (int loop = 0 ; loop<16 ; loop++){
double start_1 = static_cast<double>(cv::getTickCount());
tbb::affinity_partitioner ap;
tbb::parallel_for( tbb::blocked_range<int>(0,count_feature),
[&](tbb::blocked_range<int> r )
{
for (int i=r.begin(); i<r.end(); ++i)
{
auto distance = ( feat.row(i)- b_cmp ).squaredNorm();
if (distance < 0.5) {
mutex1.lock();
found_number.push_back(i);
mutex1.unlock();
}
}
},ap);
double timefin = ((double)cv::getTickCount() - start_1) / cv::getTickFrequency();
std::cout << count_feature << " TOTAL : " << timefin << std::endl;
}
}
编译标志:
-Xpreprocessor -std=c++11 -fopenmp -pthread -O3 -mavx2 -march=native -funroll-loops -fpermissive
Eigen 版本为 3.3.7,并链接了 TBB、OpenCV 和 Eigen。
您可以去掉 OpenCV,改用其他方式来计算耗时。
谢谢
如果您按照访问 feat 的相同顺序来存储它(即在您的情况下使用 Eigen::RowMajor),速度应该会快约 4 倍。
去掉所有与 Eigen 无关的内容后的最小示例:
int numberOFdata = 10000008;
Eigen::Matrix<float,Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> feat = Eigen::MatrixXf::Random(numberOFdata, 512);
Eigen::RowVectorXf b_cmp = Eigen::MatrixXf::Random(1, 512);
int count_feature = feat.rows();
std::vector<int> found_number;
for (int loop = 0; loop < 16; loop++) {
auto start = std::chrono::steady_clock::now();
{
for (int i = 0; i < feat.rows(); ++i) {
float distance = (feat.row(i) - b_cmp).squaredNorm();
if (distance < 0.5f) {
found_number.push_back(i);
}
}
};
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end-start;
std::cout << count_feature << " TOTAL : " <<
diff.count() << std::endl;
}
Godbolt 演示(由于内存限制,减小了 feat 的维度):https://godbolt.org/z/b6r5K4Yxv