我有哪些选项可以将 OpenCV 的 reduce 循环转换为原生 iOS 代码？有人用过 SIMD 吗？



哪种原生 iOS 框架最适合用来降低下面这段基于 OpenCV 编写的代码的 CPU 占用？

/// Collapses the channel values of every element of `input` into one value by summing them.
///
/// - Parameter input: The matrix whose per-element channel values are summed.
/// - Returns: A new `CV_8UC1` matrix with the same number of rows and columns as `input`.
/// - Throws: Any error raised by `Mat.put` while writing an output element.
static func reduce(input: Mat) throws -> Mat {
    let rowCount = input.rows()
    let columnCount = input.cols()
    let output = Mat(rows: rowCount, cols: columnCount, type: CvType.CV_8UC1)

    for rowIndex in 0 ..< rowCount {
        for columnIndex in 0 ..< columnCount {
            // One get and one put per element: simple, but this is the hot path.
            let channelSum = input.get(row: rowIndex, col: columnIndex).reduce(0, +)
            try output.put(row: rowIndex, col: columnIndex, data: [channelSum])
        }
    }

    return output
}

在真实数据上，这段代码中的这些 get 和 put 调用大约需要 20 多秒才能完成。

假设输入矩阵的类型为 CV_64FC2，请对矩阵的每一行调用一次 computeSumX2 函数。

未测试。

#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>
// Load 8 FP64 values (4 adjacent pairs), add each pair, convert the 4 sums to
// unsigned integers and pack them into a single uint32x4_t vector.
// rsi must point to at least 8 readable doubles.
//
// Declared `static inline` on purpose: in C99 a plain `inline` definition does
// not emit an external definition, so any call the compiler decides not to
// inline (e.g. at -O0) would fail to link.
static inline uint32x4_t reduce4( const double* rsi )
{
    // Load 8 values into four 2-lane registers
    float64x2x4_t f64 = vld1q_f64_x4( rsi );
    // Add them pairwise: each result lane is the sum of one adjacent source pair
    float64x2_t f64_1 = vpaddq_f64( f64.val[ 0 ], f64.val[ 1 ] );
    float64x2_t f64_2 = vpaddq_f64( f64.val[ 2 ], f64.val[ 3 ] );
    // Convert FP64 to uint64 (rounds toward zero)
    uint64x2_t i64_1 = vcvtq_u64_f64( f64_1 );
    uint64x2_t i64_2 = vcvtq_u64_f64( f64_2 );
    // Narrow uint64 to uint32 with saturation, merging both halves into one vector
    uint32x2_t low = vqmovn_u64( i64_1 );
    return vqmovn_high_u64( low, i64_2 );
}
// Compute the pairwise sum of FP64 values and store each sum as one byte,
// saturating to the 0..255 range.
//
// rdi    - destination; receives `length` bytes
// length - number of source pairs, i.e. number of bytes written
// rsi    - source; `length * 2` doubles laid out as consecutive pairs
void computeSumX2( uint8_t* rdi, size_t length, const double* rsi )
{
    const double* const rsiEnd = rsi + length * 2;
    // The vector loop only handles whole groups of 16 pairs
    const size_t lengthAligned = ( length / 16 ) * 16;
    const double* const rsiEndAligned = rsi + lengthAligned * 2;
    for( ; rsi < rsiEndAligned; rsi += 16 * 2, rdi += 16 )
    {
        // Each iteration of the loop loads 32 source values, stores 16 bytes
        uint16x4_t low16 = vqmovn_u32( reduce4( rsi ) );
        uint16x8_t u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 ) );
        uint8x8_t low8 = vqmovn_u16( u16 );
        low16 = vqmovn_u32( reduce4( rsi + 8 * 2 ) );
        u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 * 3 ) );
        uint8x16_t res = vqmovn_high_u16( low8, u16 );
        vst1q_u8( rdi, res );
    }
    for( ; rsi < rsiEnd; rsi += 2, rdi++ )
    {
        // Each iteration of the loop loads 2 source values, stores a single byte
        float64x2_t f64 = vld1q_f64( rsi );
        double sum = vaddvq_f64( f64 );
        // Saturate exactly like the vector path above. A bare (uint8_t) cast of
        // an out-of-range double is undefined behaviour in C and would make the
        // last `length % 16` bytes disagree with the rest of the row.
        if( sum >= 255.0 )
            *rdi = 255;
        else if( sum > 0.0 )
            *rdi = (uint8_t)sum;    // in range: truncates toward zero
        else
            *rdi = 0;               // negative values and NaN
    }
}

对于像我这样不太了解 ARM intrinsics（内建函数）的人，一个更简单的解决方案是像 Soots 建议的那样桥接到 Objective-C 代码。这样就绕开了 OpenCV 笨拙的 Swift API，避免了 get 和 put 调用带来的高昂内存复制开销。

// Sum the first two byte channels of every pixel into a single-channel byte
// image, saturating at 255.
//
// input       - first byte of the source image; channels are read as unsigned bytes
// rows        - number of rows to process
// columns     - number of pixels per row
// step        - distance in bytes between the starts of consecutive source rows
// channels    - bytes per source pixel; only the first two are summed, so the
//               caller must pass channels >= 2
// output      - destination image, one byte per pixel
// output_step - distance in bytes between the starts of consecutive destination rows
void fasterSumX2( const char *input,
                  int rows,
                  int columns,
                  long step,
                  int channels,
                  char* output,
                  long output_step
                  )
{
    for (int j = 0; j < rows; j++) {
        const unsigned char *src_row = (const unsigned char *)input + step * j;
        unsigned char *dst_row = (unsigned char *)output + output_step * j;
        for (int i = 0; i < columns; i++) {
            const unsigned char *ptr = src_row + (long)i * channels;
            int res = ptr[0] + ptr[1];
            // Two unsigned bytes can add up to 510. Saturate instead of
            // asserting: an assert aborts debug builds and vanishes in release
            // builds, where the value would silently wrap when narrowed.
            if (res > 255) {
                res = 255;
            }
            dst_row[i] = (unsigned char)res;
        }
    }
}

最新更新