ARM Neon 优化的一些例子.docVIP

  • 76
  • 0
  • 约1.99千字
  • 约 6页
  • 2018-01-04 发布于河南
  • 举报
ARM Neon 优化的一些例子

/*
 * ARM NEON optimisation examples.
 *
 * Each routine runs its bulk loop with NEON intrinsics when the compiler
 * advertises NEON support, and finishes any leftover elements with a plain
 * scalar loop. When NEON is not available, the scalar loop does all the work,
 * so the functions stay usable (and testable) on any target.
 */
#include <stdint.h>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define NEON_EXAMPLES_USE_NEON 1
#else
#define NEON_EXAMPLES_USE_NEON 0
#endif

/* Fixed-point (8 fractional bits) RGB weights used by the greyscale
 * converters; they add up to 256, so the result always fits in a byte. */
enum {
    LUMA_WEIGHT_R = 77,
    LUMA_WEIGHT_G = 151,
    LUMA_WEIGHT_B = 28,
    LUMA_SHIFT = 8
};

/* ------------------------------------------------------------------------
 * Sum of the first `nitems` elements of an array.
 * ------------------------------------------------------------------------ */

/**
 * Add up ptr[0] .. ptr[nitems - 1].
 *
 * @param ptr     array of at least `nitems` elements
 * @param nitems  number of elements to add; may be 0 and need not be a
 *                multiple of 4
 * @return the sum, wrapping modulo 2^32
 */
uint32_t vector_add_of_n(const uint32_t *ptr, uint32_t nitems)
{
    uint32_t result = 0;
    uint32_t i = 0;

#if NEON_EXAMPLES_USE_NEON
    uint32x4_t acc = vdupq_n_u32(0);

    /* Only whole groups of four are loaded, so no read goes past the end. */
    for (; nitems - i >= 4; i += 4) {
        acc = vaddq_u32(acc, vld1q_u32(ptr + i));
    }

    /* Fold the four lanes: 4 -> 2 -> 1. */
    uint32x2_t pair = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    result = vget_lane_u32(pair, 0) + vget_lane_u32(pair, 1);
#endif

    /* Scalar tail (or the whole array when NEON is unavailable). */
    for (; i < nitems; i++) {
        result += ptr[i];
    }

    return result;
}

/* ------------------------------------------------------------------------
 * Sum of products of corresponding elements of two arrays.
 * ------------------------------------------------------------------------ */

/**
 * For every output index n, accumulate
 *     sum = h[0]*x[n-n_coefs+1] + ... + h[n_coefs-1]*x[n]
 * and store ((sum >> 15) + 1) >> 1 in y[n].
 *
 * @param y        output, `n_out` elements
 * @param x        input; elements x[-(n_coefs - 1)] .. x[n_out - 1] are read,
 *                 so the caller must provide n_coefs - 1 samples of history
 *                 in front of x[0]
 * @param h        coefficients, `n_coefs` elements
 * @param n_out    number of outputs to produce
 * @param n_coefs  number of coefficients; need not be a multiple of 4
 *
 * NOTE(review): the shifts are applied to a signed int; this presumably
 * relies on the compiler implementing an arithmetic right shift for
 * negative sums - confirm for the target toolchain.
 */
void fir(short *y, const short *x, const short *h, int n_out, int n_coefs)
{
    for (int n = 0; n < n_out; n++) {
        /* First input sample that contributes to y[n]. */
        const short *window = x + (n - n_coefs + 1);
        int sum = 0;
        int k = 0;

#if NEON_EXAMPLES_USE_NEON
        int32x4_t acc = vdupq_n_s32(0);

        /* Four 16x16 -> 32-bit multiply-accumulates per iteration. */
        for (; n_coefs - k >= 4; k += 4) {
            int16x4_t h_vec = vld1_s16(h + k);
            int16x4_t x_vec = vld1_s16(window + k);
            acc = vmlal_s16(acc, h_vec, x_vec);
        }

        sum += vgetq_lane_s32(acc, 0);
        sum += vgetq_lane_s32(acc, 1);
        sum += vgetq_lane_s32(acc, 2);
        sum += vgetq_lane_s32(acc, 3);
#endif

        /* Remaining n_coefs % 4 taps (or all taps without NEON). */
        for (; k < n_coefs; k++) {
            sum += h[k] * window[k];
        }

        y[n] = (short)(((sum >> 15) + 1) >> 1);
    }
}

/* ------------------------------------------------------------------------
 * Colour to greyscale.
 * ------------------------------------------------------------------------ */

/**
 * Plain C reference: convert `n` interleaved R,G,B byte triplets to one
 * grey byte each, computed as (77*r + 151*g + 28*b) >> 8.
 *
 * @param dest  output, `n` bytes
 * @param src   input, `3 * n` bytes; must not overlap `dest`
 * @param n     number of pixels
 */
void reference_convert(uint8_t * __restrict dest,
                       const uint8_t * __restrict src, int n)
{
    for (int i = 0; i < n; i++) {
        int r = *src++;
        int g = *src++;
        int b = *src++;
        int y = (r * LUMA_WEIGHT_R) + (g * LUMA_WEIGHT_G) + (b * LUMA_WEIGHT_B);

        *dest++ = (uint8_t)(y >> LUMA_SHIFT);
    }
}

/**
 * NEON version of reference_convert(): same arithmetic, eight pixels per
 * iteration. Pixels left over when `n` is not a multiple of 8 are converted
 * by a scalar loop, so all `n` outputs are written.
 *
 * @param dest  output, `n` bytes
 * @param src   input, `3 * n` bytes; must not overlap `dest`
 * @param n     number of pixels
 */
void neon_convert(uint8_t * __restrict dest,
                  const uint8_t * __restrict src, int n)
{
    int i = 0;

#if NEON_EXAMPLES_USE_NEON
    const uint8x8_t rfac = vdup_n_u8(LUMA_WEIGHT_R);
    const uint8x8_t gfac = vdup_n_u8(LUMA_WEIGHT_G);
    const uint8x8_t bfac = vdup_n_u8(LUMA_WEIGHT_B);

    for (; n - i >= 8; i += 8) {
        /* De-interleaving load: val[0] = 8 reds, val[1] = greens, val[2] = blues. */
        uint8x8x3_t rgb = vld3_u8(src);

        /* Widening multiply-accumulate into 16-bit lanes. */
        uint16x8_t temp = vmull_u8(rgb.val[0], rfac);
        temp = vmlal_u8(temp, rgb.val[1], gfac);
        temp = vmlal_u8(temp, rgb.val[2], bfac);

        /* Shift right by 8 and narrow back to bytes. */
        vst1_u8(dest, vshrn_n_u16(temp, LUMA_SHIFT));

        src += 8 * 3;
        dest += 8;
    }
#endif

    /* Scalar tail (or every pixel when NEON is unavailable). */
    for (; i < n; i++) {
        int r = *src++;
        int g = *src++;
        int b = *src++;
        int y = (r * LUMA_WEIGHT_R) + (g * LUMA_WEIGHT_G) + (b * LUMA_WEIGHT_B);

        *dest++ = (uint8_t)(y >> LUMA_SHIFT);
    }
}

文档评论(0)

1亿VIP精品文档

相关文档