OpenCV (C++) 提速技巧(以Haar小波变换为例)

最近在写利用小波变换进行图像融合的算法,该算法对性能有要求,因此整理了两点能够简单快速提升算法速度的技巧,以下以笔者写的小波变换代码为例。

并行计算

cv::parallel_for_()是OpenCV的一个并行框架,其利用OpenMP、Windows并发、pthreads等并行框架构建。当调用cv::parallel_for_() 函数时,OpenCV会基于线程池分配CPU资源,将任务划分为多个独立且并行执行的较小子任务。但是需要注意的是,如果在嵌套循环的每层都使用并行化,可能会导致线程过多,反而增加调度开销,降低性能。

以下是未经优化的原始Haar小波变换及逆变换算法。

// The four sub-bands of a single-level 2-D wavelet decomposition.
// The member order matters: the struct is brace-initialised positionally
// as {cA, cH, cV, cD}.
struct WaveletCoeffs {
  cv::Mat cA;  // low-frequency (approximation) component
  cv::Mat cH;  // horizontal high-frequency component
  cv::Mat cV;  // vertical high-frequency component
  cv::Mat cD;  // diagonal high-frequency component
};

// Single-level 2-D Haar wavelet decomposition (baseline version: plain
// sequential loops with cv::Mat::at element access).
//
// img is converted to CV_32FC1. When a dimension is odd, one extra row/column
// is appended at the bottom/right with BORDER_REFLECT_101 so that the image
// splits cleanly into 2x2 blocks.
// NOTE(review): the float element access below presumably expects a
// single-channel input -- confirm at the call site.
//
// Returns the four half-resolution CV_32FC1 sub-bands {cA, cH, cV, cD}.
WaveletCoeffs dwt(const cv::Mat& img) {
  cv::Mat padded{};
  img.convertTo(padded, CV_32FC1);

  // 0 for an even dimension, 1 for an odd one.
  const int extra_rows = padded.rows % 2;
  const int extra_cols = padded.cols % 2;
  cv::copyMakeBorder(padded, padded, 0, extra_rows, 0, extra_cols, cv::BORDER_REFLECT_101);

  const int out_rows = padded.rows / 2;
  const int out_cols = padded.cols / 2;
  WaveletCoeffs bands{cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1)};

  for (int y = 0; y < out_rows; ++y) {
    const int top = 2 * y;
    for (int x = 0; x < out_cols; ++x) {
      const int left = 2 * x;

      // The 2x2 source block: p<row><col>.
      const float p00 = padded.at<float>(top, left);
      const float p01 = padded.at<float>(top, left + 1);
      const float p10 = padded.at<float>(top + 1, left);
      const float p11 = padded.at<float>(top + 1, left + 1);

      // Signed sums of the block, each scaled by 0.5.
      bands.cA.at<float>(y, x) = (p00 + p01 + p10 + p11) * 0.5f;
      bands.cH.at<float>(y, x) = (p00 - p01 + p10 - p11) * 0.5f;
      bands.cV.at<float>(y, x) = (p00 + p01 - p10 - p11) * 0.5f;
      bands.cD.at<float>(y, x) = (p00 - p01 - p10 + p11) * 0.5f;
    }
  }

  return bands;
}

// Single-level inverse 2-D Haar transform (baseline version: plain sequential
// loops with cv::Mat::at element access).
//
// The output size is taken from coeffs.cA: twice its rows and columns.
// NOTE(review): cH, cV and cD are read at the same indices as cA, so they are
// presumably expected to have the same size and CV_32FC1 type -- confirm.
//
// Returns the reconstructed image converted to CV_8UC1.
cv::Mat idwt(const WaveletCoeffs& coeffs) {
  const int band_rows = coeffs.cA.rows;
  const int band_cols = coeffs.cA.cols;
  cv::Mat full = cv::Mat::zeros(band_rows * 2, band_cols * 2, CV_32FC1);

  for (int y = 0; y < band_rows; ++y) {
    const int top = 2 * y;
    for (int x = 0; x < band_cols; ++x) {
      const int left = 2 * x;

      const float a = coeffs.cA.at<float>(y, x);
      const float h = coeffs.cH.at<float>(y, x);
      const float v = coeffs.cV.at<float>(y, x);
      const float d = coeffs.cD.at<float>(y, x);

      // Each coefficient quadruple expands back into one 2x2 output block.
      full.at<float>(top, left) = (a + h + v + d) * 0.5f;
      full.at<float>(top, left + 1) = (a - h + v - d) * 0.5f;
      full.at<float>(top + 1, left) = (a + h - v - d) * 0.5f;
      full.at<float>(top + 1, left + 1) = (a - h - v + d) * 0.5f;
    }
  }

  cv::Mat out{};
  full.convertTo(out, CV_8UC1);
  return out;
}

在这里,笔者使用两张1800x1800分辨率的灰度(单通道)图像,利用小波变换进行多对焦图像融合,预热并运行三次输出平均融合时间。

// Time one fusion call with OpenCV's tick counter and report it in milliseconds.
const auto start_fuse_time = cv::getTickCount();
cv::Mat fused_img = fusion(img_lst);
const auto fuse_time = cv::getTickCount();

// ticks / (ticks per second) gives seconds; * 1000 gives milliseconds.
const double fuse_elapsed_ms =
    static_cast<double>(fuse_time - start_fuse_time) / cv::getTickFrequency() * 1000;
std::cout << "融合耗时:" << fuse_elapsed_ms << " ms" << std::endl;

// >>> 融合耗时:29.1 ms

接下来笔者将最外层的行循环改为并行计算,并利用同样的方式测试时间。

// Single-level 2-D Haar wavelet decomposition with the outer (row) loop
// distributed through cv::parallel_for_; elements are still accessed with
// cv::Mat::at.
//
// img is converted to CV_32FC1. When a dimension is odd, one extra row/column
// is appended at the bottom/right with BORDER_REFLECT_101 so that the image
// splits cleanly into 2x2 blocks.
// NOTE(review): the float element access below presumably expects a
// single-channel input -- confirm at the call site.
//
// Returns the four half-resolution CV_32FC1 sub-bands {cA, cH, cV, cD}.
WaveletCoeffs dwt(const cv::Mat& img) {
  cv::Mat padded{};
  img.convertTo(padded, CV_32FC1);

  // 0 for an even dimension, 1 for an odd one.
  const int extra_rows = padded.rows % 2;
  const int extra_cols = padded.cols % 2;
  cv::copyMakeBorder(padded, padded, 0, extra_rows, 0, extra_cols, cv::BORDER_REFLECT_101);

  const int out_rows = padded.rows / 2;
  const int out_cols = padded.cols / 2;
  WaveletCoeffs bands{cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1)};

  // Handles one contiguous stripe of output rows. Output row y only reads
  // source rows 2y and 2y+1 and only writes row y of each band.
  const auto transform_rows = [&](const cv::Range& stripe) {
    for (int y = stripe.start; y < stripe.end; ++y) {
      const int top = 2 * y;
      for (int x = 0; x < out_cols; ++x) {
        const int left = 2 * x;

        // The 2x2 source block: p<row><col>.
        const float p00 = padded.at<float>(top, left);
        const float p01 = padded.at<float>(top, left + 1);
        const float p10 = padded.at<float>(top + 1, left);
        const float p11 = padded.at<float>(top + 1, left + 1);

        // Signed sums of the block, each scaled by 0.5.
        bands.cA.at<float>(y, x) = (p00 + p01 + p10 + p11) * 0.5f;
        bands.cH.at<float>(y, x) = (p00 - p01 + p10 - p11) * 0.5f;
        bands.cV.at<float>(y, x) = (p00 + p01 - p10 - p11) * 0.5f;
        bands.cD.at<float>(y, x) = (p00 - p01 - p10 + p11) * 0.5f;
      }
    }
  };

  // Parallelise over output rows only.
  cv::parallel_for_(cv::Range(0, out_rows), transform_rows);

  return bands;
}

// Single-level inverse 2-D Haar transform with the outer (row) loop
// distributed through cv::parallel_for_; elements are still accessed with
// cv::Mat::at.
//
// The output size is taken from coeffs.cA: twice its rows and columns.
// NOTE(review): cH, cV and cD are read at the same indices as cA, so they are
// presumably expected to have the same size and CV_32FC1 type -- confirm.
//
// Returns the reconstructed image converted to CV_8UC1.
cv::Mat idwt(const WaveletCoeffs& coeffs) {
  const int band_rows = coeffs.cA.rows;
  const int band_cols = coeffs.cA.cols;
  cv::Mat full = cv::Mat::zeros(band_rows * 2, band_cols * 2, CV_32FC1);

  // Handles one contiguous stripe of coefficient rows. Coefficient row y only
  // writes output rows 2y and 2y+1.
  const auto rebuild_rows = [&](const cv::Range& stripe) {
    for (int y = stripe.start; y < stripe.end; ++y) {
      const int top = 2 * y;
      for (int x = 0; x < band_cols; ++x) {
        const int left = 2 * x;

        const float a = coeffs.cA.at<float>(y, x);
        const float h = coeffs.cH.at<float>(y, x);
        const float v = coeffs.cV.at<float>(y, x);
        const float d = coeffs.cD.at<float>(y, x);

        // Each coefficient quadruple expands back into one 2x2 output block.
        full.at<float>(top, left) = (a + h + v + d) * 0.5f;
        full.at<float>(top, left + 1) = (a - h + v - d) * 0.5f;
        full.at<float>(top + 1, left) = (a + h - v - d) * 0.5f;
        full.at<float>(top + 1, left + 1) = (a - h - v + d) * 0.5f;
      }
    }
  };

  // Parallelise over coefficient rows only.
  cv::parallel_for_(cv::Range(0, band_rows), rebuild_rows);

  cv::Mat out{};
  full.convertTo(out, CV_8UC1);
  return out;
}

// >>> 融合耗时:12.6 ms 

可以看到,耗时减少了约57%(29.1 ms → 12.6 ms,约2.3倍加速)。那如果两层嵌套循环都使用并行化,对于速度提升会有帮助吗?笔者的CPU为AMD 9900X,硬件层面来讲供线程调度的核心数是管够的。以下为伪代码。

// Pseudocode: wrap both the row loop and the column loop in cv::parallel_for_.
// The inner loop must be bounded by its own sub-range (range2); bounding it by
// the outer row range would iterate the columns over the wrong interval.
// NOTE(review): OpenCV appears to execute a cv::parallel_for_ that is nested
// inside another one serially in the calling thread -- confirm against the
// OpenCV version in use.
cv::parallel_for_(cv::Range(0, half_rows), [&](const cv::Range& range) {
  for (int r = range.start; r < range.end; r++) {
    cv::parallel_for_(cv::Range(0, half_cols), [&](const cv::Range& range2) {
        for (int c = range2.start; c < range2.end; c++) {
          ...
        }
    });
  }
});

// 融合耗时:12.8 ms

可以看出速度并没有明显提升,有可能是上下文切换开销过高或者是线程调度不均匀导致的。另外需要注意,OpenCV的cv::parallel_for_()并不支持嵌套并行:在并行区域内部再次调用时,内层循环会直接在当前线程中串行执行,因此内层的并行化本身并不会带来额外的加速。

指针访问

另一种提升算法速度的技巧是在访问像素点时采用指针的形式访问。在OpenCV中访问矩阵的某个像素常采用的方式是mat.at<type>(row, col),在以上代码中则以const float val_cA = cA.at<float>(r, c);形式出现。这种方法每次访问都要重新计算元素地址,并且在Debug构建下还会先对该位置进行边界检查(CV_DbgAssert),再去访问。如果访问过于频繁,无疑会拖慢运行速度。因此在确保循环边界的情况下,可以直接使用指针操作来避免这些开销,形式如auto* row_ptr = mat.ptr<type>(r);。不过需要注意,这里只传入行号这一个参数,得到的是该行首元素的指针,行内元素需要自行按列下标访问,并且不会再有任何越界检查。

以下为仅将原版算法改为使用指针访问的小波变换算法。

// Single-level 2-D Haar wavelet decomposition using row pointers
// (cv::Mat::ptr) instead of per-element cv::Mat::at calls; loops are sequential.
//
// img is converted to CV_32FC1. When a dimension is odd, one extra row/column
// is appended at the bottom/right with BORDER_REFLECT_101 so that the image
// splits cleanly into 2x2 blocks.
// NOTE(review): the float row pointers below presumably expect a
// single-channel input -- confirm at the call site.
//
// Returns the four half-resolution CV_32FC1 sub-bands {cA, cH, cV, cD}.
WaveletCoeffs dwt(const cv::Mat& img) {
  cv::Mat padded{};
  img.convertTo(padded, CV_32FC1);

  // 0 for an even dimension, 1 for an odd one.
  const int extra_rows = padded.rows % 2;
  const int extra_cols = padded.cols % 2;
  cv::copyMakeBorder(padded, padded, 0, extra_rows, 0, extra_cols, cv::BORDER_REFLECT_101);

  const int out_rows = padded.rows / 2;
  const int out_cols = padded.cols / 2;
  WaveletCoeffs bands{cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1),
                      cv::Mat::zeros(out_rows, out_cols, CV_32FC1)};

  for (int y = 0; y < out_rows; ++y) {
    // Row pointers are fetched once per output row; the inner loop indexes
    // them directly.
    const float* upper = padded.ptr<float>(2 * y);
    const float* lower = padded.ptr<float>(2 * y + 1);
    float* approx_out = bands.cA.ptr<float>(y);
    float* horiz_out = bands.cH.ptr<float>(y);
    float* vert_out = bands.cV.ptr<float>(y);
    float* diag_out = bands.cD.ptr<float>(y);

    for (int x = 0; x < out_cols; ++x) {
      const int left = 2 * x;

      // The 2x2 source block: p<row><col>.
      const float p00 = upper[left];
      const float p01 = upper[left + 1];
      const float p10 = lower[left];
      const float p11 = lower[left + 1];

      // Signed sums of the block, each scaled by 0.5.
      approx_out[x] = (p00 + p01 + p10 + p11) * 0.5f;
      horiz_out[x] = (p00 - p01 + p10 - p11) * 0.5f;
      vert_out[x] = (p00 + p01 - p10 - p11) * 0.5f;
      diag_out[x] = (p00 - p01 - p10 + p11) * 0.5f;
    }
  }

  return bands;
}

// Single-level inverse 2-D Haar transform using row pointers (cv::Mat::ptr)
// instead of per-element cv::Mat::at calls; loops are sequential.
//
// The output size is taken from coeffs.cA: twice its rows and columns.
// NOTE(review): cH, cV and cD are read at the same indices as cA, so they are
// presumably expected to have the same size and CV_32FC1 type -- confirm.
//
// Returns the reconstructed image converted to CV_8UC1.
cv::Mat idwt(const WaveletCoeffs& coeffs) {
  const int band_rows = coeffs.cA.rows;
  const int band_cols = coeffs.cA.cols;
  cv::Mat full = cv::Mat::zeros(band_rows * 2, band_cols * 2, CV_32FC1);

  for (int y = 0; y < band_rows; ++y) {
    // Row pointers are fetched once per coefficient row; the inner loop
    // indexes them directly.
    const float* approx_in = coeffs.cA.ptr<float>(y);
    const float* horiz_in = coeffs.cH.ptr<float>(y);
    const float* vert_in = coeffs.cV.ptr<float>(y);
    const float* diag_in = coeffs.cD.ptr<float>(y);

    float* upper = full.ptr<float>(2 * y);
    float* lower = full.ptr<float>(2 * y + 1);

    for (int x = 0; x < band_cols; ++x) {
      const int left = 2 * x;

      const float a = approx_in[x];
      const float h = horiz_in[x];
      const float v = vert_in[x];
      const float d = diag_in[x];

      // Each coefficient quadruple expands back into one 2x2 output block.
      upper[left] = (a + h + v + d) * 0.5f;
      upper[left + 1] = (a - h + v - d) * 0.5f;
      lower[left] = (a + h - v - d) * 0.5f;
      lower[left + 1] = (a - h - v + d) * 0.5f;
    }
  }

  cv::Mat out{};
  full.convertTo(out, CV_8UC1);
  return out;
}

// >>> 融合耗时:12.1 ms

可以看到,直接通过指针访问像素点,其运算速度已经达到了使用并行计算的速度,其效率提升是非常可观的。

posted @ 2025-12-15 13:46  絵守辛玥  阅读(118)  评论(0)    收藏  举报