OpenCV (C++) Speedup Tips (Using the Haar Wavelet Transform as an Example)
I've recently been working on a wavelet-based image fusion algorithm with performance requirements, so I put together two simple tricks that quickly speed things up. My own wavelet transform code serves as the running example below.
Parallel Computing
cv::parallel_for_() is OpenCV's parallel framework; under the hood it is built on parallel backends such as OpenMP, the Windows Concurrency Runtime, and pthreads. When cv::parallel_for_() is called, OpenCV allocates CPU resources from a thread pool and splits the work into smaller subtasks that execute independently in parallel. Be aware, though, that parallelizing every level of a nested loop can create too many tasks, which adds scheduling overhead and may actually reduce performance.
Below is the original, unoptimized Haar wavelet transform and its inverse.
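To illustrate just the calling pattern before diving into the wavelet code, here is a minimal, hypothetical sketch: the lambda receives a cv::Range covering a chunk of the outer iteration range, and cv::getNumThreads()/cv::setNumThreads() can be used to inspect or cap OpenCV's thread pool.
#include <opencv2/core.hpp>
#include <vector>

// Hypothetical example: square every element of a vector in parallel.
void square_all(std::vector<float>& data) {
    // parallel_for_ splits [0, data.size()) into chunks; each chunk is
    // handed to the lambda as a cv::Range on some worker thread.
    cv::parallel_for_(cv::Range(0, static_cast<int>(data.size())),
                      [&](const cv::Range& range) {
        for (int i = range.start; i < range.end; i++) {
            data[i] *= data[i];
        }
    });
}

// Optional: inspect or limit the worker count of OpenCV's thread pool.
// int n = cv::getNumThreads();
// cv::setNumThreads(n / 2);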
struct WaveletCoeffs {
cv::Mat cA; // low-frequency (approximation) coefficients
cv::Mat cH; // horizontal high-frequency (detail)
cv::Mat cV; // vertical high-frequency (detail)
cv::Mat cD; // diagonal high-frequency (detail)
};
WaveletCoeffs dwt(const cv::Mat& img) {
cv::Mat img_float{};
img.convertTo(img_float, CV_32FC1);
const int h = img_float.rows;
const int w = img_float.cols;
const int pad_h = (h % 2 != 0) ? 1 : 0;
const int pad_w = (w % 2 != 0) ? 1 : 0;
cv::copyMakeBorder(img_float, img_float, 0, pad_h, 0, pad_w, cv::BORDER_REFLECT_101);
const int half_rows = img_float.rows / 2;
const int half_cols = img_float.cols / 2;
cv::Mat cA = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cH = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cV = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cD = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
for (int r = 0; r < half_rows; r++) {
for (int c = 0; c < half_cols; c++) {
const float b00 = img_float.at<float>(2 * r, 2 * c);
const float b01 = img_float.at<float>(2 * r, 2 * c + 1);
const float b10 = img_float.at<float>(2 * r + 1, 2 * c);
const float b11 = img_float.at<float>(2 * r + 1, 2 * c + 1);
cA.at<float>(r, c) = (b00 + b01 + b10 + b11) * 0.5f;
cH.at<float>(r, c) = (b00 - b01 + b10 - b11) * 0.5f;
cV.at<float>(r, c) = (b00 + b01 - b10 - b11) * 0.5f;
cD.at<float>(r, c) = (b00 - b01 - b10 + b11) * 0.5f;
}
}
return {cA, cH, cV, cD};
}
cv::Mat idwt(const WaveletCoeffs& coeffs) {
const cv::Mat& cA = coeffs.cA;
const cv::Mat& cH = coeffs.cH;
const cv::Mat& cV = coeffs.cV;
const cv::Mat& cD = coeffs.cD;
const int half_rows = cA.rows;
const int half_cols = cA.cols;
const int rows = half_rows * 2;
const int cols = half_cols * 2;
cv::Mat recovered = cv::Mat::zeros(rows, cols, CV_32FC1);
for (int r = 0; r < half_rows; r++) {
for (int c = 0; c < half_cols; c++) {
const float val_cA = cA.at<float>(r, c);
const float val_cH = cH.at<float>(r, c);
const float val_cV = cV.at<float>(r, c);
const float val_cD = cD.at<float>(r, c);
recovered.at<float>(2 * r, 2 * c) = (val_cA + val_cH + val_cV + val_cD) * 0.5f;
recovered.at<float>(2 * r, 2 * c + 1) = (val_cA - val_cH + val_cV - val_cD) * 0.5f;
recovered.at<float>(2 * r + 1, 2 * c) = (val_cA + val_cH - val_cV - val_cD) * 0.5f;
recovered.at<float>(2 * r + 1, 2 * c + 1) = (val_cA - val_cH - val_cV + val_cD) * 0.5f;
}
}
cv::Mat result{};
recovered.convertTo(result, CV_8UC1);
return result;
}
Here I fuse two 1800x1800 grayscale (single-channel) images with wavelet-based multi-focus image fusion, warming up first and then running three times to report the average fusion time.
const auto start_fuse_time = cv::getTickCount();
cv::Mat fused_img = fusion(img_lst);
const auto fuse_time = cv::getTickCount();
std::cout << "融合耗时:"
<< static_cast<double>(fuse_time - start_fuse_time) / cv::getTickFrequency() * 1000
<< " ms" << std::endl;
// >>> Fusion time: 29.1 ms
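The snippet above times a single run; the warm-up-plus-average measurement described earlier looks roughly like the following sketch (not the exact harness I used; fusion and img_lst are the same names as above).
// One warm-up run, then the average of three timed runs.
cv::Mat fused_img = fusion(img_lst);  // warm-up, not timed

double total_ms = 0.0;
constexpr int kRuns = 3;
for (int i = 0; i < kRuns; i++) {
    const auto t0 = cv::getTickCount();
    fused_img = fusion(img_lst);
    const auto t1 = cv::getTickCount();
    total_ms += static_cast<double>(t1 - t0) / cv::getTickFrequency() * 1000;
}
std::cout << "Average fusion time: " << total_ms / kRuns << " ms" << std::endl;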
Next, I converted the outermost row loop to run in parallel and timed it the same way.
WaveletCoeffs dwt(const cv::Mat& img) {
cv::Mat img_float{};
img.convertTo(img_float, CV_32FC1);
const int h = img_float.rows;
const int w = img_float.cols;
const int pad_h = (h % 2 != 0) ? 1 : 0;
const int pad_w = (w % 2 != 0) ? 1 : 0;
cv::copyMakeBorder(img_float, img_float, 0, pad_h, 0, pad_w, cv::BORDER_REFLECT_101);
const int half_rows = img_float.rows / 2;
const int half_cols = img_float.cols / 2;
cv::Mat cA = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cH = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cV = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cD = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
// Parallelize the outer row loop here
cv::parallel_for_(cv::Range(0, half_rows), [&](const cv::Range& range) {
for (int r = range.start; r < range.end; r++) {
for (int c = 0; c < half_cols; c++) {
const float b00 = img_float.at<float>(2 * r, 2 * c);
const float b01 = img_float.at<float>(2 * r, 2 * c + 1);
const float b10 = img_float.at<float>(2 * r + 1, 2 * c);
const float b11 = img_float.at<float>(2 * r + 1, 2 * c + 1);
cA.at<float>(r, c) = (b00 + b01 + b10 + b11) * 0.5f;
cH.at<float>(r, c) = (b00 - b01 + b10 - b11) * 0.5f;
cV.at<float>(r, c) = (b00 + b01 - b10 - b11) * 0.5f;
cD.at<float>(r, c) = (b00 - b01 - b10 + b11) * 0.5f;
}
}
});
return {cA, cH, cV, cD};
}
cv::Mat idwt(const WaveletCoeffs& coeffs) {
const cv::Mat& cA = coeffs.cA;
const cv::Mat& cH = coeffs.cH;
const cv::Mat& cV = coeffs.cV;
const cv::Mat& cD = coeffs.cD;
const int half_rows = cA.rows;
const int half_cols = cA.cols;
const int rows = half_rows * 2;
const int cols = half_cols * 2;
cv::Mat recovered = cv::Mat::zeros(rows, cols, CV_32FC1);
// Parallelize the outer row loop here
cv::parallel_for_(cv::Range(0, half_rows), [&](const cv::Range& range) {
for (int r = range.start; r < range.end; r++) {
for (int c = 0; c < half_cols; c++) {
const float val_cA = cA.at<float>(r, c);
const float val_cH = cH.at<float>(r, c);
const float val_cV = cV.at<float>(r, c);
const float val_cD = cD.at<float>(r, c);
recovered.at<float>(2 * r, 2 * c) = (val_cA + val_cH + val_cV + val_cD) * 0.5f;
recovered.at<float>(2 * r, 2 * c + 1) = (val_cA - val_cH + val_cV - val_cD) * 0.5f;
recovered.at<float>(2 * r + 1, 2 * c) = (val_cA + val_cH - val_cV - val_cD) * 0.5f;
recovered.at<float>(2 * r + 1, 2 * c + 1) = (val_cA - val_cH - val_cV + val_cD) * 0.5f;
}
}
});
cv::Mat result{};
recovered.convertTo(result, CV_8UC1);
return result;
}
// >>> Fusion time: 12.6 ms
The fusion time drops by roughly 57% (from 29.1 ms to 12.6 ms, about a 2.3x speedup). Would parallelizing both levels of the nested loop help even more? My CPU is an AMD 9900X, so at the hardware level there are plenty of cores available for scheduling. Pseudocode below.
cv::parallel_for_(cv::Range(0, half_rows), [&](const cv::Range& range) {
for (int r = range.start; r < range.end; r++) {
cv::parallel_for_(cv::Range(0, half_cols), [&](const cv::Range& range2) {
for (int c = range2.start; c < range2.end; c++) {
...
}
});
}
});
// >>> Fusion time: 12.8 ms
As you can see, there is no noticeable improvement. The likely causes are the extra context-switch overhead and uneven thread scheduling introduced by the nested parallel calls.
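If finer-grained splitting is really wanted, an alternative to nesting is to flatten the two loops into one linear index range and let a single parallel_for_ partition it. Here is an untimed sketch for the dwt loop, using the same variable names as above:
// Sketch: flatten (r, c) into one linear index instead of nesting parallel_for_.
// One scheduling pass, no nested task creation.
cv::parallel_for_(cv::Range(0, half_rows * half_cols), [&](const cv::Range& range) {
    for (int i = range.start; i < range.end; i++) {
        const int r = i / half_cols;
        const int c = i % half_cols;
        const float b00 = img_float.at<float>(2 * r, 2 * c);
        const float b01 = img_float.at<float>(2 * r, 2 * c + 1);
        const float b10 = img_float.at<float>(2 * r + 1, 2 * c);
        const float b11 = img_float.at<float>(2 * r + 1, 2 * c + 1);
        cA.at<float>(r, c) = (b00 + b01 + b10 + b11) * 0.5f;
        cH.at<float>(r, c) = (b00 - b01 + b10 - b11) * 0.5f;
        cV.at<float>(r, c) = (b00 + b01 - b10 - b11) * 0.5f;
        cD.at<float>(r, c) = (b00 - b01 - b10 + b11) * 0.5f;
    }
});
Whether this beats the row-level split depends on how much work each row does; for a transform this cheap, row-level chunks are usually already fine-grained enough.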
Pointer Access
Another way to speed up the algorithm is to access pixels through raw pointers. The common way to read a single element of a cv::Mat is mat.at<type>(), which appears in the code above as const float val_cA = cA.at<float>(r, c);. Every at<> call recomputes the element address from the row step and column offset (and performs a bounds check in debug builds), so with very frequent access this overhead adds up. As long as the loop bounds are guaranteed to be valid, you can instead grab a raw row pointer once, e.g. auto* row = mat.ptr<type>(r);, and index columns through it, avoiding the per-element address arithmetic and checks. Note that with this pattern you fetch one pointer per row (or column), and the caller becomes responsible for staying within bounds.
Below is the original (non-parallel) algorithm modified only to use pointer access.
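Seen in isolation, the two access patterns look like this (a hypothetical, minimal comparison on a CV_32FC1 matrix):
cv::Mat m(480, 640, CV_32FC1);

// 1) Element access via at<>: the address is recomputed on every call.
for (int r = 0; r < m.rows; r++)
    for (int c = 0; c < m.cols; c++)
        m.at<float>(r, c) += 1.0f;

// 2) Row-pointer access: one ptr<>() call per row, plain indexing inside.
for (int r = 0; r < m.rows; r++) {
    float* row = m.ptr<float>(r);
    for (int c = 0; c < m.cols; c++)
        row[c] += 1.0f;
}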
WaveletCoeffs dwt(const cv::Mat& img) {
cv::Mat img_float{};
img.convertTo(img_float, CV_32FC1);
const int h = img_float.rows;
const int w = img_float.cols;
const int pad_h = (h % 2 != 0) ? 1 : 0;
const int pad_w = (w % 2 != 0) ? 1 : 0;
cv::copyMakeBorder(img_float, img_float, 0, pad_h, 0, pad_w, cv::BORDER_REFLECT_101);
const int half_rows = img_float.rows / 2;
const int half_cols = img_float.cols / 2;
cv::Mat cA = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cH = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cV = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
cv::Mat cD = cv::Mat::zeros(half_rows, half_cols, CV_32FC1);
// Only the pixel access is changed to pointers here
for (int r = 0; r < half_rows; r++) {
const auto* row0 = img_float.ptr<float>(2 * r);
const auto* row1 = img_float.ptr<float>(2 * r + 1);
auto* cA_row = cA.ptr<float>(r);
auto* cH_row = cH.ptr<float>(r);
auto* cV_row = cV.ptr<float>(r);
auto* cD_row = cD.ptr<float>(r);
for (int c = 0; c < half_cols; c++) {
const float b00 = row0[2 * c];
const float b01 = row0[2 * c + 1];
const float b10 = row1[2 * c];
const float b11 = row1[2 * c + 1];
cA_row[c] = (b00 + b01 + b10 + b11) * 0.5f;
cH_row[c] = (b00 - b01 + b10 - b11) * 0.5f;
cV_row[c] = (b00 + b01 - b10 - b11) * 0.5f;
cD_row[c] = (b00 - b01 - b10 + b11) * 0.5f;
}
}
return {cA, cH, cV, cD};
}
cv::Mat idwt(const WaveletCoeffs& coeffs) {
const cv::Mat& cA = coeffs.cA;
const cv::Mat& cH = coeffs.cH;
const cv::Mat& cV = coeffs.cV;
const cv::Mat& cD = coeffs.cD;
const int half_rows = cA.rows;
const int half_cols = cA.cols;
const int rows = half_rows * 2;
const int cols = half_cols * 2;
cv::Mat recovered = cv::Mat::zeros(rows, cols, CV_32FC1);
// Only the pixel access is changed to pointers here
for (int r = 0; r < half_rows; r++) {
const auto* cA_row = cA.ptr<float>(r);
const auto* cH_row = cH.ptr<float>(r);
const auto* cV_row = cV.ptr<float>(r);
const auto* cD_row = cD.ptr<float>(r);
auto* rec_row0 = recovered.ptr<float>(2 * r);
auto* rec_row1 = recovered.ptr<float>(2 * r + 1);
for (int c = 0; c < half_cols; c++) {
const float val_cA = cA_row[c];
const float val_cH = cH_row[c];
const float val_cV = cV_row[c];
const float val_cD = cD_row[c];
rec_row0[2 * c] = (val_cA + val_cH + val_cV + val_cD) * 0.5f;
rec_row0[2 * c + 1] = (val_cA - val_cH + val_cV - val_cD) * 0.5f;
rec_row1[2 * c] = (val_cA + val_cH - val_cV - val_cD) * 0.5f;
rec_row1[2 * c + 1] = (val_cA - val_cH - val_cV + val_cD) * 0.5f;
}
}
cv::Mat result{};
recovered.convertTo(result, CV_8UC1);
return result;
}
// >>> Fusion time: 12.1 ms
As you can see, simply accessing pixels through pointers already matches the speed of the parallelized version; the efficiency gain is considerable.
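A natural next step, left here as an untimed sketch, is to combine the two tricks: keep the row-level parallel_for_ and use row pointers inside the lambda. For dwt the main loop would look roughly like this:
// Sketch (untimed): row-level parallelism plus pointer access in dwt's main loop.
cv::parallel_for_(cv::Range(0, half_rows), [&](const cv::Range& range) {
    for (int r = range.start; r < range.end; r++) {
        const float* row0 = img_float.ptr<float>(2 * r);
        const float* row1 = img_float.ptr<float>(2 * r + 1);
        float* cA_row = cA.ptr<float>(r);
        float* cH_row = cH.ptr<float>(r);
        float* cV_row = cV.ptr<float>(r);
        float* cD_row = cD.ptr<float>(r);
        for (int c = 0; c < half_cols; c++) {
            const float b00 = row0[2 * c];
            const float b01 = row0[2 * c + 1];
            const float b10 = row1[2 * c];
            const float b11 = row1[2 * c + 1];
            cA_row[c] = (b00 + b01 + b10 + b11) * 0.5f;
            cH_row[c] = (b00 - b01 + b10 - b11) * 0.5f;
            cV_row[c] = (b00 + b01 - b10 - b11) * 0.5f;
            cD_row[c] = (b00 - b01 - b10 + b11) * 0.5f;
        }
    }
});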
