Tested on a Raspberry Pi 5 (rpi 5) with GCC 12:
apply_bdof_8_8x16_c: 7315.2 ( 1.00x)
apply_bdof_8_8x16_neon: 1876.8 ( 3.90x)
apply_bdof_8_16x8_c: 7170.5 ( 1.00x)
apply_bdof_8_16x8_neon: 1752.8 ( 4.09x)
apply_bdof_8_16x16_c: 14695.2 ( 1.00x)
apply_bdof_8_16x16_neon: 3490.5 ( 4.21x)
apply_bdof_10_8x16_c: 7371.5 ( 1.00x)
apply_bdof_10_8x16_neon: 1863.8 ( 3.96x)
apply_bdof_10_16x8_c: 7172.0 ( 1.00x)
apply_bdof_10_16x8_neon: 1766.0 ( 4.06x)
apply_bdof_10_16x16_c: 14551.5 ( 1.00x)
apply_bdof_10_16x16_neon: 3576.0 ( 4.07x)
apply_bdof_12_8x16_c: 7236.5 ( 1.00x)
apply_bdof_12_8x16_neon: 1863.8 ( 3.88x)
apply_bdof_12_16x8_c: 7316.5 ( 1.00x)
apply_bdof_12_16x8_neon: 1758.8 ( 4.16x)
apply_bdof_12_16x16_c: 14691.2 ( 1.00x)
apply_bdof_12_16x16_neon: 3480.5 ( 4.22x)
65 lines
3.1 KiB
C
65 lines
3.1 KiB
C
/*
|
|
* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavcodec/bit_depth_template.c"
|
|
|
|
/*
 * Prototype of the NEON BDOF block routine for the current BIT_DEPTH.
 * Only the declaration is given here; the symbol name is assembled by
 * FUNC2 from the base name, BIT_DEPTH and the "_neon" suffix.
 *
 * dst, dst_stride : destination pixels and the stride handed to the routine
 * src0, src1      : the two int16_t source buffers
 * gh, gv          : two-entry arrays of gradient pointers (one per source)
 * vx, vy          : int16_t arrays passed alongside the gradients
 */
void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel *dst,
        ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
        const int16_t **gh, const int16_t **gv, int16_t *vx, int16_t *vy);
|
|
|
|
static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
|
|
const int16_t *_src0, const int16_t *_src1,
|
|
int block_w, int block_h) {
|
|
// +2 for pad left and right
|
|
int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
|
|
int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
|
|
int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
|
|
int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
|
|
ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
|
|
pixel *dst = (pixel *) _dst;
|
|
|
|
ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
|
|
BDOF_BLOCK_SIZE,
|
|
_src0, MAX_PB_SIZE, block_w, block_h);
|
|
ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
|
|
BDOF_BLOCK_SIZE,
|
|
_src1, MAX_PB_SIZE, block_w, block_h);
|
|
|
|
for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
|
|
for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
|
|
const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
|
|
const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
|
|
pixel *d = dst + x;
|
|
int idx = BDOF_BLOCK_SIZE * y + x;
|
|
const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
|
|
const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
|
|
int16_t vx[2], vy[2];
|
|
int pad_mask = !x | ((!y) << 1) |
|
|
((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
|
|
((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
|
|
ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
|
|
FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
|
|
src0, src1, gh, gv,
|
|
vx, vy);
|
|
}
|
|
dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
|
|
}
|
|
}
|