The issue is a lack of explicit synchronization: only the very first invocation writes symbols and updates the state, which the other invocations then store.
359 lines
9.9 KiB
Plaintext
359 lines
9.9 KiB
Plaintext
/*
|
|
* FFv1 codec
|
|
*
|
|
* Copyright (c) 2024 Lynne <dev@lynne.ee>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#ifndef GOLOMB
|
|
#ifdef CACHED_SYMBOL_READER
/* Workgroup-shared scratch copy of one context's state bytes */
shared uint8_t state[CONTEXT_SIZE];
/* Code one bit against entry `idx` of the shared state copy */
#define WRITE(rc, idx, bit) put_rac_direct(rc, state[idx], bit)
#else
/* Code one bit against the state byte addressed from slice_state; expects a
 * variable named state_off to be in scope where the macro is expanded */
#define WRITE(rc, idx, bit) put_rac(rc, uint64_t(slice_state) + (state_off + idx), bit)
#endif
|
|
|
|
/*
 * Range-code the integer v: a zero flag, then the position of the highest
 * set bit of |v| in unary, then the bits below it, then the sign.
 * state_off is only referenced by the non-cached variant of WRITE().
 * Note - only handles signed values
 */
void put_symbol(inout RangeCoder c, uint state_off, int v)
{
    /* Zero is signalled by the first bit alone */
    if (v == 0) {
        WRITE(c, 0, true);
        return;
    }
    WRITE(c, 0, false);

    const int mag = abs(v);
    const int msb = findMSB(mag);

    /* Unary exponent: msb ones followed by a terminating zero */
    int i = 0;
    while (i < msb) {
        WRITE(c, 1 + min(i, 9), true);
        i++;
    }
    WRITE(c, 1 + min(msb, 9), false);

    /* Bits below the leading one, most significant first */
    for (i = msb; i > 0; i--)
        WRITE(c, 22 + min(i - 1, 9), bool(bitfieldExtract(mag, i - 1, 1)));

    /* Sign bit */
    WRITE(c, 11 + min(msb, 10), v < 0);
}
|
|
|
|
/*
 * Write one line without prediction: every loaded value is emitted as
 * `bits` bits through put_rac_equi(), highest bit first.
 */
void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
                     ivec2 sp, int y, int p, int comp, int bits)
{
#ifdef CACHED_SYMBOL_READER
    /* Only the first invocation of the workgroup emits bits */
    if (gl_LocalInvocationID.x != 0)
        return;
#endif

    int line_w = sc.slice_dim.x;

#ifndef RGB
    /* Planes 1 and 2 have their width and position scaled by chroma_shift */
    if (p == 1 || p == 2) {
        line_w >>= chroma_shift.x;
        sp >>= chroma_shift;
    }
#endif

    for (int x = 0; x < line_w; x++) {
        const uint px = imageLoad(img, sp + LADDR(ivec2(x, y)))[comp];

        for (int b = bits; b > 0; b--)
            put_rac_equi(sc.c, bool(bitfieldExtract(px, b - 1, 1)));
    }
}
|
|
|
|
/*
 * Range-code one line. For each position get_pred() supplies a context
 * index and a predicted value; the folded difference to the actual value is
 * written with put_symbol() against that context's state.
 * run_index is not used by this variant.
 */
void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off,
                 ivec2 sp, int y, int p, int comp, int bits,
                 uint8_t quant_table_idx, const int run_index)
{
    int line_w = sc.slice_dim.x;

#ifndef RGB
    /* Planes 1 and 2 have their width and position scaled by chroma_shift */
    if (p == 1 || p == 2) {
        line_w >>= chroma_shift.x;
        sp >>= chroma_shift;
    }
#endif

    const bool extend = extend_lookup[quant_table_idx] > 0;

    for (int x = 0; x < line_w; x++) {
        /* pr[0]: context index, pr[1]: predicted value */
        ivec2 pr = get_pred(img, sp, ivec2(x, y), comp, line_w,
                            quant_table_idx, extend);
        pr[1] = int(imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]) - pr[1];

        /* A negative context is mirrored together with the difference */
        if (pr[0] < 0)
            pr = -pr;

        const int resid = fold(pr[1], bits);
        const uint context_off = state_off + CONTEXT_SIZE*pr[0];

#ifdef CACHED_SYMBOL_READER
        const uint lid = gl_LocalInvocationID.x;

        /* Each invocation copies one state byte of the context into the
         * shared array ... */
        u8buf sb = u8buf(uint64_t(slice_state) + context_off + lid);
        state[lid] = sb.v;
        barrier();

        /* ... the first invocation codes the symbol against that copy ... */
        if (lid == 0)
            put_symbol(sc.c, context_off, resid);
        barrier();

        /* ... and each invocation writes its byte back afterwards */
        sb.v = state[lid];
#else
        put_symbol(sc.c, context_off, resid);
#endif
    }
}
|
|
|
|
#else /* GOLOMB */
|
|
|
|
/*
 * Golomb variant: code one line through put_bits(). Zero differences seen
 * while in run mode (entered at context 0) are counted and written as run
 * lengths; everything else goes through get_vlc_symbol().
 * run_index is carried in and out so that it persists across calls.
 */
void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off,
                 ivec2 sp, int y, int p, int comp, int bits,
                 uint8_t quant_table_idx, inout int run_index)
{
    int line_w = sc.slice_dim.x;

#ifndef RGB
    /* Planes 1 and 2 have their width and position scaled by chroma_shift */
    if (p == 1 || p == 2) {
        line_w >>= chroma_shift.x;
        sp >>= chroma_shift;
    }
#endif

    const bool extend = extend_lookup[quant_table_idx] > 0;
    bool in_run = false;
    int pending = 0;

    for (int x = 0; x < line_w; x++) {
        /* pr[0]: context index, pr[1]: predicted value */
        ivec2 pr = get_pred(img, sp, ivec2(x, y), comp, line_w,
                            quant_table_idx, extend);
        pr[1] = int(imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]) - pr[1];

        if (pr[0] < 0)
            pr = -pr;

        pr[1] = fold(pr[1], bits);

        /* Context 0 switches run mode on; it stays on until a run ends */
        in_run = in_run || (pr[0] == 0);

        if (in_run) {
            /* Still inside the run: just count */
            if (pr[1] == 0) {
                pending++;
                continue;
            }

            /* A very unlikely loop */
            while (pending >= 1 << log2_run[run_index]) {
                pending -= 1 << log2_run[run_index];
                run_index++;
                put_bits(sc.pb, 1, 1);
            }

            /* The run ended: write the remaining count and leave run mode */
            put_bits(sc.pb, 1 + log2_run[run_index], pending);
            if (run_index != 0)
                run_index--;
            pending = 0;
            in_run = false;

            /* Zero cannot occur here, so positive values shift down by one */
            if (pr[1] > 0)
                pr[1]--;
        }

        VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*pr[0]);
        Symbol sym = get_vlc_symbol(sb, pr[1], bits);
        put_bits(sc.pb, sym.bits, sym.val);
    }

    /* Nothing more to write unless the line ended inside a run */
    if (!in_run)
        return;

    while (pending >= (1 << log2_run[run_index])) {
        pending -= 1 << log2_run[run_index];
        run_index++;
        put_bits(sc.pb, 1, 1);
    }

    if (pending > 0)
        put_bits(sc.pb, 1, 1);
}
|
|
#endif
|
|
|
|
#ifdef RGB
|
|
/*
 * Load the pixel at pos from src[0], replace components 1.. with the first
 * component of src[1].. when planar_rgb is set, and return the components
 * reordered through fmt_lut.
 */
ivec4 load_components(ivec2 pos)
{
    ivec4 raw = ivec4(imageLoad(src[0], pos));

    if (planar_rgb != 0) {
        int plane = 1;
        while (plane < (3 + transparency)) {
            raw[plane] = int(imageLoad(src[plane], pos)[0]);
            plane++;
        }
    }

    ivec4 ordered;
    ordered.x = raw[fmt_lut[0]];
    ordered.y = raw[fmt_lut[1]];
    ordered.z = raw[fmt_lut[2]];
    ordered.w = raw[fmt_lut[3]];
    return ordered;
}
|
|
|
|
/*
 * In-place colour transform: b and r become differences to g, g gains a
 * weighted quarter of those differences, then b and r are biased by
 * rct_offset.
 */
void transform_sample(inout ivec4 pix, ivec2 rct_coef)
{
    const int db = pix.b - pix.g;
    const int dr = pix.r - pix.g;

    pix.g += (dr*rct_coef.x + db*rct_coef.y) >> 2;
    pix.b = db + rct_offset;
    pix.r = dr + rct_offset;
}
|
|
|
|
/*
 * Fill line y of `tmp` for the slice: each invocation handles the columns
 * starting at its local x index and stepping by the workgroup width, loading
 * the pixel with load_components() and optionally applying
 * transform_sample() before storing it.
 *
 * NOTE(review): no barrier is issued here after the stores - confirm that
 * readers of `tmp` in other invocations are synchronized by the caller.
 */
void preload_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
{
    const uint stride = gl_WorkGroupSize.x;
    uint x = gl_LocalInvocationID.x;

    while (x < w) {
        ivec4 px = load_components(sc.slice_pos + ivec2(x, y));

        if (expectEXT(apply_rct, true))
            transform_sample(px, sc.slice_rct_coef);

        imageStore(tmp, sp + LADDR(ivec2(x, y)), px);
        x += stride;
    }
}
|
|
#endif
|
|
|
|
/*
 * Encode all lines of one slice.
 *
 * Without GOLOMB, slice_coding_mode == 1 selects the raw path
 * (encode_line_pcm()); otherwise every line goes through encode_line().
 * In RGB builds each line is first staged into `tmp` by preload_rgb() and
 * its components are coded in the order 1, 2, 0 (then 3 when transparency
 * is 1); in other builds the planes are coded one after another straight
 * from src[].
 *
 * NOTE(review): preload_rgb() stores to `tmp` from every invocation and the
 * line is read back immediately afterwards with no barrier in between in
 * this function - confirm this is synchronized elsewhere.
 */
void encode_slice(inout SliceContext sc, const uint slice_idx)
{
    ivec2 sp = sc.slice_pos;

#ifndef RGB
    int bits = bits_per_raw_sample;
#else
    /* One extra bit unless the slice uses coding mode 1 */
    int bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);

    /* Component order used for coding, and how many of them are present */
    const int comp_order[4] = int[4](1, 2, 0, 3);
    const int nb_comp = 3 + int(transparency == 1);

    sp.y = int(gl_WorkGroupID.y)*RGB_LINECACHE;
#endif

#ifndef GOLOMB
    if (sc.slice_coding_mode == 1) {
#ifndef RGB
        for (int c = 0; c < components; c++) {
            int h = sc.slice_dim.y;
            if (c == 1 || c == 2)
                h >>= chroma_shift.y;

            /* Takes into account dual-plane YUV formats */
            int p = min(c, planes - 1);
            int comp = c - p;

            for (int y = 0; y < h; y++)
                encode_line_pcm(sc, src[p], sp, y, p, comp, bits);
        }
#else
        for (int y = 0; y < sc.slice_dim.y; y++) {
            preload_rgb(sc, sp, sc.slice_dim.x, y, false);

            for (int i = 0; i < nb_comp; i++)
                encode_line_pcm(sc, tmp, sp, y, 0, comp_order[i], bits);
        }
#endif
        return;
    }
#endif

    /* Per-component quant table index and offset into the slice state;
     * components 1 and 2 share both */
    u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
    u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size;

#ifndef RGB
    for (int c = 0; c < components; c++) {
        /* The run index restarts for every component */
        int run_index = 0;

        int h = sc.slice_dim.y;
        if (c == 1 || c == 2)
            h >>= chroma_shift.y;

        /* Takes into account dual-plane YUV formats */
        int p = min(c, planes - 1);
        int comp = c - p;

        for (int y = 0; y < h; y++)
            encode_line(sc, src[p], slice_state_off[c], sp, y, p,
                        comp, bits, quant_table_idx[c], run_index);
    }
#else
    /* A single run index is shared by all components and lines */
    int run_index = 0;

    for (int y = 0; y < sc.slice_dim.y; y++) {
        preload_rgb(sc, sp, sc.slice_dim.x, y, true);

        for (int i = 0; i < nb_comp; i++)
            encode_line(sc, tmp, slice_state_off[i],
                        sp, y, 0, comp_order[i], bits,
                        quant_table_idx[i], run_index);
    }
#endif
}
|
|
|
|
/*
 * Close the slice bitstream: terminate the coder, append the 3-byte slice
 * length, optionally append a zero byte plus CRC, and record the final size
 * and the offset from out_data in slice_results.
 */
void finalize_slice(inout SliceContext sc, const uint slice_idx)
{
#ifdef CACHED_SYMBOL_READER
    /* Only the first invocation of the workgroup finalizes */
    if (gl_LocalInvocationID.x != 0)
        return;
#endif

#ifdef GOLOMB
    uint32_t len = sc.hdr_len + flush_put_bits(sc.pb);
#else
    uint32_t len = rac_terminate(sc.c);
#endif

    u8buf bs = u8buf(sc.c.bytestream_start);

    /* Append slice length: bytes z, y, x of the unpacked value */
    const u8vec4 len_bytes = unpack8(len);
    bs[len + 0].v = len_bytes.z;
    bs[len + 1].v = len_bytes.y;
    bs[len + 2].v = len_bytes.x;
    len += 3;

    /* Calculate and write CRC */
    if (ec != 0) {
        /* A zero byte precedes the CRC and is covered by it */
        bs[len].v = uint8_t(0);
        len += 1;

        uint32_t crc = crcref;
        for (uint32_t i = 0; i < len; i++)
            crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8);

        if (crcref != 0x00000000)
            crc ^= 0x8CD88196;

        /* Stored as bytes x, y, z, w of the unpacked value */
        const u8vec4 crc_bytes = unpack8(crc);
        bs[len + 0].v = crc_bytes.x;
        bs[len + 1].v = crc_bytes.y;
        bs[len + 2].v = crc_bytes.z;
        bs[len + 3].v = crc_bytes.w;
        len += 4;
    }

    const uint res = slice_idx*2;
    slice_results[res + 0] = len;
    slice_results[res + 1] = uint64_t(bs) - uint64_t(out_data);
}
|
|
|
|
/* Entry point: each workgroup encodes and finalizes the slice whose index is
 * derived from its x/y workgroup id */
void main(void)
{
    const uint row_base = gl_WorkGroupID.y*gl_NumWorkGroups.x;
    const uint slice_idx = row_base + gl_WorkGroupID.x;

    encode_slice(slice_ctx[slice_idx], slice_idx);
    finalize_slice(slice_ctx[slice_idx], slice_idx);
}
|