LibreELEC.tv/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
Shaun2029 ce9deb7aa7 Fixes: Libreelec 8.2.0.1 Rpi3 3D iso not playing properly
This amends commit b27b2a9 with the omitted MVC parser code for ffmpeg-99.1003-pfcd_hevc_optimisations.patch
2017-12-15 18:12:22 +00:00

30518 lines
955 KiB

diff --git a/.gitignore b/.gitignore
index 524fb73c16..bcc983739f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
*.a
*.o
*.o.*
+*.bin
*.d
*.def
*.dll
@@ -23,6 +24,7 @@
.\#*
/.config
/.version
+/build/
/ffmpeg
/ffplay
/ffprobe
diff --git a/ffmpeg.c b/ffmpeg.c
index cdded8673f..5eee7dfd40 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -23,6 +23,11 @@
* multimedia converter based on the FFmpeg libraries
*/
+#ifdef RPI
+#define RPI_DISPLAY
+#define RPI_DISPLAY_ALL 0
+#endif
+
#include "config.h"
#include <ctype.h>
#include <string.h>
@@ -42,6 +47,7 @@
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswresample/swresample.h"
+#include "libavutil/atomic.h"
#include "libavutil/opt.h"
#include "libavutil/channel_layout.h"
#include "libavutil/parseutils.h"
@@ -66,6 +72,25 @@
# include "libavfilter/buffersrc.h"
# include "libavfilter/buffersink.h"
+#ifdef RPI_DISPLAY
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/mmal/mmal.h>
+#include <interface/mmal/mmal_parameters_camera.h>
+#include <interface/mmal/mmal_buffer.h>
+#include <interface/mmal/mmal_port.h>
+#include <interface/mmal/util/mmal_util.h>
+#include <interface/mmal/util/mmal_default_components.h>
+#include <interface/mmal/util/mmal_connection.h>
+#include <interface/mmal/util/mmal_util_params.h>
+#pragma GCC diagnostic pop
+#include "libavcodec/rpi_qpu.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "libavcodec/rpi_zc.h"
+#endif
+
#if HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/types.h>
@@ -158,6 +183,241 @@ static int restore_tty;
static void free_input_threads(void);
#endif
+#ifdef RPI_DISPLAY
+
+#define NUM_BUFFERS 4
+
+
+typedef struct rpi_display_env_s
+{
+ MMAL_COMPONENT_T* display;
+ MMAL_COMPONENT_T* isp;
+ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup
+ MMAL_CONNECTION_T * conn;
+
+ MMAL_POOL_T *rpi_pool;
+ volatile int rpi_display_count;
+ enum AVPixelFormat avfmt;
+} rpi_display_env_t;
+
+static rpi_display_env_t * rpi_display_env = NULL;
+
+
+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port)
+{
+ MMAL_POOL_T* pool;
+ mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
+ assert(pool);
+
+ return pool;
+}
+
+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata;
+ av_rpi_zc_unref(buffer->user_data);
+ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1);
+ mmal_buffer_header_release(buffer);
+}
+
+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ mmal_buffer_header_release(buffer);
+}
+
+#define DISPLAY_PORT_DEPTH 4
+
+static rpi_display_env_t *
+display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h)
+{
+ MMAL_STATUS_T err;
+ MMAL_DISPLAYREGION_T region =
+ {
+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+ .layer = 2,
+ .fullscreen = 0,
+ .dest_rect = {x, y, w, h}
+ };
+#if RPI_ZC_SAND_8_IN_10_BUF
+ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt;
+#else
+ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt;
+#endif
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
+ rpi_display_env_t * de;
+ int isp_req = (fmt == AV_PIX_FMT_SAND64_10);
+
+ bcm_host_init(); // Needs to be done by someone...
+
+ if ((de = av_mallocz(sizeof(*de))) == NULL) {
+ return NULL;
+ }
+
+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display);
+ av_assert0(de->display);
+ de->port_in = de->display->input[0];
+
+ if (isp_req)
+ {
+ mmal_component_create("vc.ril.isp", &de->isp);
+ de->port_in = de->isp->input[0];
+ }
+
+ mmal_port_parameter_set(de->display->input[0], &region.hdr);
+
+ {
+ MMAL_PORT_T * const port = de->port_in;
+ MMAL_ES_FORMAT_T* const format = port->format;
+ port->userdata = (struct MMAL_PORT_USERDATA_T *)de;
+ port->buffer_num = DISPLAY_PORT_DEPTH;
+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 :
+ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 :
+ MMAL_ENCODING_I420;
+ format->es->video.width = geo.stride_y;
+ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ?
+ (h + 15) & ~15 : geo.height_y; // Magic
+ format->es->video.crop.x = 0;
+ format->es->video.crop.y = 0;
+ format->es->video.crop.width = w;
+ format->es->video.crop.height = h;
+ mmal_port_format_commit(port);
+ }
+
+ de->rpi_pool = display_alloc_pool(de->port_in);
+ mmal_port_enable(de->port_in,display_cb_input);
+
+ if (isp_req) {
+ MMAL_PORT_T * const port_out = de->isp->output[0];
+ mmal_log_dump_port(de->port_in);
+ mmal_format_copy(port_out->format, de->port_in->format);
+ if (fmt == AV_PIX_FMT_SAND64_10) {
+ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS ||
+ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS)
+ {
+ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n");
+ }
+ else
+ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n");
+
+ }
+ port_out->format->encoding = MMAL_ENCODING_I420;
+ mmal_log_dump_port(port_out);
+ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n");
+ goto fail;
+ }
+ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) {
+ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n");
+ goto fail;
+ }
+ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) {
+ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n");
+ goto fail;
+ }
+ mmal_port_enable(de->isp->control,display_cb_control);
+ mmal_component_enable(de->isp);
+ }
+
+ mmal_component_enable(de->display);
+ mmal_port_enable(de->display->control,display_cb_control);
+ de->avfmt = fmt;
+
+ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
+
+ return de;
+
+fail:
+ // **** Free stuff
+ return NULL;
+}
+
+static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
+{
+ MMAL_BUFFER_HEADER_T* buf;
+
+ if (de == NULL)
+ return;
+
+ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+ return;
+ }
+
+ buf = mmal_queue_get(de->rpi_pool->queue);
+ if (!buf) {
+ // Running too fast so drop the frame
+ printf("Q alloc failure\n");
+ return;
+ }
+ assert(buf);
+ buf->cmd = 0;
+ buf->offset = 0; // Offset to valid data
+ buf->flags = 0;
+ {
+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1);
+ if (fr_buf == NULL) {
+ mmal_buffer_header_release(buf);
+ return;
+ }
+
+ buf->user_data = fr_buf;
+ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal
+ buf->offset = av_rpi_zc_offset(fr_buf);
+ buf->length = av_rpi_zc_length(fr_buf);
+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
+ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1);
+ }
+#if RPI_DISPLAY_ALL
+ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
+ usleep(5000);
+ }
+#endif
+
+ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
+ {
+ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
+ display_cb_input(de->port_in, buf);
+ }
+}
+
+static void display_exit(rpi_display_env_t ** const pde)
+{
+ rpi_display_env_t * const de = *pde;
+ *pde = NULL;
+
+ if (de != NULL) {
+// sleep(120);
+
+ if (de->port_in != NULL) {
+ mmal_port_disable(de->port_in);
+ }
+
+ // The above disable should kick out all buffers - check that
+ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) {
+ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count));
+ }
+
+ if (de->conn != NULL) {
+ mmal_connection_destroy(de->conn);
+ }
+ if (de->isp != NULL) {
+ mmal_component_destroy(de->isp);
+ }
+ if (de->display != NULL) {
+ mmal_component_destroy(de->display);
+ }
+ if (de->rpi_pool != NULL) {
+ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool);
+ }
+
+ av_free(de);
+ }
+}
+
+#endif
+
+
/* sub2video hack:
Convert subtitles to video with alpha to insert them in filter graphs.
This is a temporary solution until libavfilter gets real subtitles support.
@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret)
avformat_close_input(&input_files[i]->ctx);
av_freep(&input_files[i]);
}
+
+#ifdef RPI_DISPLAY
+ display_exit(&rpi_display_env);
+#endif
+
for (i = 0; i < nb_input_streams; i++) {
InputStream *ist = input_streams[i];
@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret)
av_freep(&ist->filters);
av_freep(&ist->hwaccel_device);
+#ifdef RPI_DISPLAY
+ av_rpi_zc_uninit(ist->dec_ctx);
+#endif
avcodec_free_context(&ist->dec_ctx);
av_freep(&input_streams[i]);
@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret)
}
term_exit();
ffmpeg_exited = 1;
+
}
void remove_avoptions(AVDictionary **a, AVDictionary *b)
@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s,
if (ost->source_index >= 0)
ist = input_streams[ost->source_index];
+#ifdef RPI_DISPLAY
+ if (next_picture && ist != NULL)
+ {
+ if (rpi_display_env == NULL)
+ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height);
+ display_frame(ist->dec_ctx, rpi_display_env, next_picture);
+ }
+#endif
+
if (filter->inputs[0]->frame_rate.num > 0 &&
filter->inputs[0]->frame_rate.den > 0)
duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
ist->dec_ctx->opaque = ist;
ist->dec_ctx->get_format = get_format;
ist->dec_ctx->get_buffer2 = get_buffer;
+
+#ifdef RPI_DISPLAY
+ // Overrides the above get_buffer2
+ av_rpi_zc_init(ist->dec_ctx);
+#endif
+
ist->dec_ctx->thread_safe_callbacks = 1;
av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
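For orientation, an illustrative C sketch (not part of the patch) of how the ffmpeg.c additions above fit together: av_rpi_zc_init() swaps in zero-copy buffers at decoder setup, do_video_out() lazily builds the MMAL display pipe from the first decoded frame and queues every frame to it, and ffmpeg_cleanup() tears it down with display_exit(). The wrapper name rpi_show_frame is made up here purely for illustration.

#ifdef RPI_DISPLAY
/* Hypothetical wrapper condensing the do_video_out() hunk above. */
static void rpi_show_frame(InputStream *const ist, AVFrame *const frame)
{
    if (rpi_display_env == NULL)   /* created lazily, sized to the first frame */
        rpi_display_env = display_init(frame->format, 0, 0,
                                       frame->width, frame->height);
    display_frame(ist->dec_ctx, rpi_display_env, frame);
}
#endif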
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb28aea1e2..741aa0bdc4 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -5,6 +5,16 @@ NAME = avcodec
HEADERS = avcodec.h \
avdct.h \
avfft.h \
+ rpi_opts.h \
+ rpi_qpu.h \
+ rpi_shader.h \
+ rpi_shader_cmd.h \
+ rpi_shader_template.h \
+ rpi_shader_template_fn.h \
+ rpi_mailbox.h \
+ rpi_hevc_transform8.h \
+ rpi_hevc_transform10.h \
+ rpi_zc.h \
d3d11va.h \
dirac.h \
dv_profile.h \
@@ -43,6 +53,11 @@ OBJS = allcodecs.o \
resample.o \
resample2.o \
utils.o \
+ rpi_qpu.o \
+ rpi_shader.o \
+ rpi_shader_template.o \
+ rpi_mailbox.o \
+ rpi_zc.o \
vorbis_parser.o \
xiph.o \
@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
$(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
endif
+
+QASM_PY := ../local/bin/qasm.py
+VASMVIDCORE := ../local/bin/vasmvidcore_std
+
+ifneq ("$(wildcard $(QASM_PY))","")
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+endif
+
+ifneq ("$(wildcard $(VASMVIDCORE))","")
+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
+
+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
+ python pi-util/make_array.py $<
+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
+ python pi-util/make_array.py $<
+
+endif
+
+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
+$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 54efaad..02a89c3 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -667,6 +667,7 @@ void avcodec_register_all(void)
REGISTER_PARSER(H261, h261);
REGISTER_PARSER(H263, h263);
REGISTER_PARSER(H264, h264);
+ REGISTER_PARSER(H264_MVC, h264_mvc);
REGISTER_PARSER(HEVC, hevc);
REGISTER_PARSER(MJPEG, mjpeg);
REGISTER_PARSER(MLP, mlp);
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a4ceca7f46..f8229a80e2 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
+ arm/hevc_misc_neon.o \
arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_epel_neon.o \
arm/hevcdsp_idct_neon.o \
- arm/hevcdsp_qpel_neon.o
+ arm/hevcdsp_cres_neon.o \
+ arm/hevcdsp_res16_neon.o \
+ arm/hevcdsp_qpel_neon.o \
+ arm/hevcdsp_sao_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b45e..0a3980a1ef 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,13 +26,34 @@
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"
+
+#if UNCHECKED_BITSTREAM_READER
+#define LOAD_16BITS_BEHI\
+ "ldrh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#elif CONFIG_THUMB
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "it cs \n\t"\
+ "ldrhcs %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#else
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "ldrcsh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#endif
+
+
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
uint8_t *const state)
{
int bit;
+#if 0
void *reg_b, *reg_c, *tmp;
-
__asm__ volatile(
"ldrb %[bit] , [%[state]] \n\t"
"add %[r_b] , %[tables] , %[lps_off] \n\t"
@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
[mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
: "memory", "cc"
);
+#else
+ // *** Not thumb compatible yet
+ unsigned int reg_b, tmp;
+ __asm__ (
+ "ldrb %[bit] , [%[state]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b] , %[tmp], lsl #1] \n\t"
+// %bit = *state
+// %range = range
+// %tmp = RangeLPS
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state]] \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+
+ "bne 2f \n\t"
+ LOAD_16BITS_BEHI
+ "lsr %[tmp] , %[tmp] , #15 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "sub %[tmp] , %[tmp] , %[r_b] \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [ptr]"+&r"(c->bytestream),
+ [tmp]"=&r"(tmp)
+ : [state]"r"(state),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+#endif
return bit & 1;
}
+
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+ int rv = 0;
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "adc %[rv] , %[rv] , #0 \n\t"
+ "it cs \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ite cc \n\t"
+ "rsbcc %[rv] , %[rv] , #0 \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
#endif /* HAVE_ARMV6T2_INLINE */
#endif /* AVCODEC_ARM_CABAC_H */
diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
new file mode 100644
index 0000000000..31d3c59205
--- /dev/null
+++ b/libavcodec/arm/hevc_cabac.h
@@ -0,0 +1,491 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+ unsigned int n;
+ __asm__ (
+ "rev %[n], %[x] \n\t"
+ : [n]"=r"(n)
+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+ :
+ );
+ return n << (bits & 7);
+}
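In plain C this peek is just a big-endian 32-bit load followed by a sub-byte shift; a minimal sketch (illustrative, not part of the patch), assuming AV_RB32 from libavutil/intreadwrite.h:

static inline uint32_t hevc_mem_bits32_c(const void *p, const unsigned int bits)
{
    /* rev of a little-endian load == big-endian read of the same 4 bytes */
    return AV_RB32((const uint8_t *)p + (bits >> 3)) << (bits & 7);
}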
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about / use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ int rv;
+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+ __asm__ (
+ "ssat %[rv], #16, %[t], ASR #1 \n\t"
+ : [rv]"=r"(rv)
+ : [t]"r"(t)
+ :
+ );
+ return rv;
+}
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ int t;
+ __asm__ (
+ "lsl %[t], %[coeff], #1 \n\t"
+ "lsrs %[t], %[t], %[shift] \n\t"
+ "it eq \n\t"
+ "subeq %[stat], %[stat], #1 \n\t"
+ "cmp %[t], #6 \n\t"
+ "adc %[stat], %[stat], #0 \n\t"
+ "usat %[stat], #8, %[stat] \n\t"
+ : [stat]"+&r"(*stat_coeff),
+ [t]"=&r"(t)
+ : [coeff]"r"(last_coeff_abs_level_remaining),
+ [shift]"r"(c_rice_param)
+ : "cc"
+ );
+}
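What the two helpers above compute, written out as plain C for reference (an illustrative sketch, not part of the patch; the clamp ranges come directly from the ssat #16 and usat #8 operands):

static inline int trans_scale_sat_c(const int level, const unsigned int scale,
                                    const unsigned int scale_m, const unsigned int shift)
{
    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
    t >>= 1;                            /* the ", ASR #1" folded into ssat */
    return t < -32768 ? -32768 :        /* ssat #16: signed 16-bit saturation */
           t >  32767 ?  32767 : t;
}

static inline void update_rice_c(uint8_t *const stat_coeff,
                                 const unsigned int last_coeff_abs_level_remaining,
                                 const unsigned int c_rice_param)
{
    const unsigned int t = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
    int stat = *stat_coeff;
    if (t == 0)                         /* lsrs sets Z; subeq */
        stat--;
    if (t >= 6)                         /* cmp #6 sets C; adc #0 */
        stat++;
    if (stat < 0)   stat = 0;           /* usat #8: clamp to [0, 255] */
    if (stat > 255) stat = 255;
    *stat_coeff = (uint8_t)stat;
}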
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater than 1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i, reg_b, st, tmp, bit, rv;
+ __asm__ (
+ "mov %[i] , #0 \n\t"
+ "mov %[rv] , #0 \n\t"
+ "1: \n\t"
+ "add %[i] , %[i] , #1 \n\t"
+ "cmp %[rv] , #0 \n\t"
+ "ite eq \n\t"
+ "usateq %[st] , #2 , %[i] \n\t"
+ "movne %[st] , #0 \n\t"
+
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "and %[bit] , %[bit] , #1 \n\t"
+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "it ne \n\t"
+ "cmpne %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+ "tst %[tmp] , %[tmp] \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+ "cmp %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [i]"=&r"(i),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st),
+ [rv]"=&r"(rv)
+ : [state0]"r"(state0),
+ [n]"r"(n),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+ return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t const * ctx_map,
+ uint8_t * p)
+{
+ unsigned int reg_b, tmp, st, bit;
+ __asm__ (
+ "1: \n\t"
+// Get bin from map
+ "ldrb %[st] , [%[ctx_map], %[n]] \n\t"
+
+// Load state & ranges
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "tst %[bit] , #1 \n\t"
+// GCC asm seems to need strbne written differently for thumb and arm
+#if CONFIG_THUMB
+ "it ne \n\t"
+ "strbne %[n] , [%[idx]] , #1 \n\t"
+#else
+ "strneb %[n] , [%[idx]] , #1 \n\t"
+#endif
+
+// Renorm
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "subs %[n] , %[n] , #1 \n\t"
+#if CONFIG_THUMB
+ "itt ne \n\t"
+ "lslsne %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#else
+ "lslnes %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#endif
+
+// If we have bits left then n must be 0 so give up now
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+// Check to see if we still have more to do
+ "cmp %[n] , #0 \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [idx]"+&r"(p),
+ [n]"+&r"(n),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st)
+ : [state0]"r"(state0),
+ [ctx_map]"r"(ctx_map),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+
+ return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+//
+// By and large these are (at best) no faster than their C equivalents - the
+// only one worth having is _peek where we do a slightly better job than the
+// compiler
+//
+// The others have been stashed here for reference in case larger scale asm
+// is attempted in which case they might be a useful base
+
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+ uint32_t rv, tmp;
+ __asm__ (
+ "bic %[rv] , %[low], #1 \n\t"
+ "cmp %[inv] , #0 \n\t"
+ "it ne \n\t"
+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+ : // Outputs
+ [rv]"=&r"(rv),
+ [tmp]"=r"(tmp)
+ : // Inputs
+ [low]"r"(c->low),
+ [inv]"r"(c->range)
+ : // Clobbers
+ "cc"
+ );
+ return rv << 1;
+}
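For comparison with the note above, a plain-C rendering of the peek (illustrative, not part of the patch). It assumes, as the [inv] operand does, that in by22 mode c->range holds a 32-bit inverse multiplier, with 0 meaning the divider is 1:

static inline uint32_t get_cabac_by22_peek_c(const CABACContext *const c)
{
    uint32_t x = c->low & ~1U;
    if (c->range != 0)                  /* umullne: keep only the high 32 bits */
        x = (uint32_t)(((uint64_t)x * c->range) >> 32);
    return x << 1;
}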
+
+#if 0
+
+// ***** Slower than the C :-(
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+{
+ uint32_t m, tmp;
+ __asm__ (
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[m], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+ "mul %[tmp], %[range], %[tmp] \n\t"
+
+ "rev %[m], %[m] \n\t"
+
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[m], %[m], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[m], lsr #9 \n\t"
+ : // Outputs
+ [m]"=&r"(m),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [n]"r"(n),
+ [val]"r"(val),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ );
+}
+
+
+// Works but slower than C
+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+{
+ uint32_t n, val, tmp, level;
+
+// PROFILE_START();
+
+ __asm__ (
+ // Peek
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Count bits (n = prefix)
+ "mvn %[n], %[val] \n\t"
+ "clz %[n], %[n] \n\t"
+
+ "lsl %[level], %[val], %[n] \n\t"
+ "subs %[tmp], %[n], #3 \n\t"
+ "blo 2f \n\t"
+
+ // prefix >= 3
+ // < tmp = prefix - 3
+ // > tmp = prefix + rice - 3
+ "add %[tmp], %[tmp], %[rice] \n\t"
+ // > n = prefix * 2 + rice - 3
+ "add %[n], %[tmp], %[n] \n\t"
+ "cmp %[n], #21 \n\t"
+ "bhi 3f \n\t"
+
+ "orr %[level], %[level], #0x80000000 \n\t"
+ "rsb %[tmp], %[tmp], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // > 22 bits used in total - need reload
+ "3: \n\t"
+
+ // Stash prefix + rice - 3 in level (only spare reg)
+ "mov %[level], %[tmp] \n\t"
+ // Restore n to flush value (prefix)
+ "sub %[n], %[n], %[tmp] \n\t"
+
+ // Flush + reload
+
+// "rsb %[tmp], %[n], #32 \n\t"
+// "lsr %[tmp], %[val], %[tmp] \n\t"
+// "mul %[tmp], %[range], %[tmp] \n\t"
+
+ // As it happens we know that all the bits we are flushing are 1
+ // so we can cheat slightly
+ "rsb %[tmp], %[range], %[range], lsl %[n] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[n], [%[ptr], %[bits], lsr #3] \n\t"
+ "rev %[n], %[n] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[n], %[n], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[n], lsr #9 \n\t"
+
+ // (reload)
+
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Build value
+
+ "mov %[n], %[level] \n\t"
+
+ "orr %[tmp], %[val], #0x80000000 \n\t"
+ "rsb %[level], %[level], #31 \n\t"
+ "lsr %[level], %[tmp], %[level] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // prefix < 3
+ "2: \n\t"
+ "rsb %[tmp], %[rice], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+ "orr %[level], %[level], %[n], lsl %[rice] \n\t"
+ "add %[n], %[n], %[rice] \n\t"
+
+ "1: \n\t"
+ // Flush
+ "add %[n], %[n], #1 \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[val], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "mul %[tmp], %[range], %[tmp] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "rev %[val], %[val] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[val], %[val], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[val], lsr #9 \n\t"
+ : // Outputs
+ [level]"=&r"(level),
+ [n]"=&r"(n),
+ [val]"=&r"(val),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [rice]"r"(c_rice_param),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ "cc"
+ );
+
+// PROFILE_ACC(residual_abs);
+
+ return level;
+}
+#endif
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S
new file mode 100644
index 0000000000..380d3c8d3b
--- /dev/null
+++ b/libavcodec/arm/hevc_idct_fn_neon.S
@@ -0,0 +1,224 @@
+@ Included multiple times from hevc_idct_neon.S
+@ Macros defined there
+
+#define DC_SHIFT (15 - BIT_DEPTH)
+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH)))
+#define TRN_SHIFT (20 - BIT_DEPTH)
+
+function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q0, r1
+ vdup.16 q1, r1
+ vst1.16 {q0, q1}, [r0]
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ mov r3, #16
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+1: subs r3, #1
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+endfunc
+
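The *_dc_ functions above all reduce to the same scalar computation before the block fill; a plain-C sketch (illustrative, not part of the patch) using the DC_ADD/DC_SHIFT macros defined at the top of this file:

static void hevc_idct_NxN_dc_c(int16_t *const coeffs, const int n)
{
    /* One rounded, shifted DC value replicated across the whole n x n block */
    const int dc = (coeffs[0] + DC_ADD) >> DC_SHIFT;
    for (int i = 0; i < n * n; i++)
        coeffs[i] = dc;
}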
+
+function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x00240053 // 36 and 83
+ vmov.32 d0[0], r3
+
+ tr4_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT)
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+
+
+function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x4a // 74
+ vmov.32 d0[0], r3
+ ldr r3, =0x1d // 29
+ vmov.32 d0[1], r3
+ ldr r3, =0x37 // 55
+ vmov.32 d1[0], r3
+
+ tr4_luma_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT)
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+
+
+function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1
+ push {r4-r8}
+ vpush {d8-d15}
+ mov r5, #16
+
+ adrl r3, tr4f
+ vld1.16 {d0, d1}, [r3]
+
+ // left half
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #128
+ //skip right half if col_limit in r1 is less than 4
+ cmp r1, #4
+ blt 1f
+ //right half
+ add r0, #8
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #136
+1:
+ // top half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #(TRN_SHIFT)
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ vstm r0!, {q1-q4}
+
+ // bottom half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #(TRN_SHIFT)
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ //vstm r0, {q1-q4}
+ vst1.16 {q1-q2}, [r0]
+ add r0, #32
+ vst1.16 {q3-q4}, [r0]
+ sub r0, #32
+ vpop {d8-d15}
+ pop {r4-r8}
+ bx lr
+endfunc
+
+#undef DC_SHIFT
+#undef DC_ADD
+#undef TRN_SHIFT
+
diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S
new file mode 100644
index 0000000000..373576b4cb
--- /dev/null
+++ b/libavcodec/arm/hevc_misc_neon.S
@@ -0,0 +1,62 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ rpi_zap_coeff_vals_neon(
+@ uint16_t * buf, [r0]
+@ unsigned int log_n_m2) [r1]
+
+function rpi_zap_coeff_vals_neon, export=1
+ vmov.i64 q8, #0
+ adr r12, zc_tab
+ vmov.i64 q9, #0
+ tst r0, #63
+ vmov.i64 q10, #0
+ add r0, #63
+ vmov.i64 q11, #0
+ and r0, #~63
+ ldr pc, [r12, r1, lsl #2]
+
+zc_tab:
+ .word zc_lc2
+ .word zc_lc3
+ .word zc_lc4
+ .word zc_lc5
+
+@ 4*4*2: "32 bytes" 64 or 0 depending on dst address
+zc_lc2:
+ it eq
+ vstmeq r0, {q8-q11}
+ bx lr
+
+@ 16*16*2 = 512 = 64 * 8
+zc_lc4:
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+@ 8*8*2 = 128
+zc_lc3:
+ vstm r0!, {q8-q11}
+ vstm r0, {q8-q11}
+ bx lr
+
+@ 32*32*2 = 2048 = 128 * 16
+zc_lc5:
+ vmov.i64 q12, #0
+ vmov.i64 q13, #0
+ vmov.i64 q14, #0
+ vmov.i64 q15, #0
+ mov r2, #4
+1:
+ vstm r0!, {q8-q15}
+ subs r2, #1
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+
+endfunc
+
diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S
new file mode 100644
index 0000000000..bafefd4318
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_cres_neon.S
@@ -0,0 +1,296 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ General notes:
+@
+@ Residual is only guaranteed to be clipped to 16 bits
+@ This means that we do need to do vmovl, vqadd, vqmovun
+@ rather than vaddw, vqmovun (if we were clipped to 15 bits then we could get
+@ away with this)
+
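Per pixel, the note above amounts to the following plain-C operation (an illustrative sketch, not part of the patch): because the residual may occupy the full int16_t range, the sum has to be formed wider than 8 bits and then saturated, which is what the vmovl/vqadd/vqmovun sequences below do.

static inline uint8_t add_residual_px(const uint8_t px, const int16_t res)
{
    const int v = px + res;             /* vmovl.u8 + vqadd.s16: widen, saturating add */
    return v < 0   ? 0   :              /* vqmovun.s16: narrow with unsigned saturation */
           v > 255 ? 255 : (uint8_t)v;
}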
+@ ============================================================================
+@ U add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_add_residual_4x4_u_neon_8, export=1
+ vld1.8 {d16}, [r0, :64], r2
+ vld1.8 {d17}, [r0, :64], r2
+ vld1.8 {d18}, [r0, :64], r2
+ vld1.8 {d19}, [r0, :64], r2
+ vld1.16 {q0, q1}, [r1]
+ vdup.16 q2, r3
+ vdup.16 q3, r3
+ vmovl.u8 q10, d16
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+ vst1.8 {d2}, [r0, :64], r2
+ vst1.8 {d3}, [r0, :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_add_residual_8x8_u_neon_8, export=1
+ mov r12, #4
+ vdup.16 q15, r3
+1:
+ vld2.8 {d16, d17}, [r0, :128], r2
+ vld2.8 {d18, d19}, [r0, :128]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ sub r0, r2
+ vmovl.u8 q11, d18
+ vqadd.s16 q0, q10
+ vaddw.u8 q2, q15, d17
+ vqadd.s16 q1, q11
+ vaddw.u8 q3, q15, d19
+ vqmovun.s16 d16, q0
+ vqmovun.s16 d17, q2
+ vqmovun.s16 d18, q1
+ vqmovun.s16 d19, q3
+ vst2.8 {d16, d17}, [r0, :128], r2
+ vst2.8 {d18, d19}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_add_residual_16x16_u_neon_8, export=1
+ mov r12, #16
+ vdup.16 q15, r3
+1:
+ vld2.8 {q8, q9}, [r0, :256]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vaddw.u8 q2, q15, d18
+ vaddw.u8 q3, q15, d19
+ vqmovun.s16 d16, q0
+ vqmovun.s16 d17, q1
+ vqmovun.s16 d18, q2
+ vqmovun.s16 d19, q3
+ vst2.8 {q8, q9}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ V add
+
+@ add_residual4x4_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_4x4_v_neon_8, export=1
+ vld1.8 {d16}, [r0, :64], r2
+ vld1.8 {d17}, [r0, :64], r2
+ vld1.8 {d18}, [r0, :64], r2
+ vld1.8 {d19}, [r0, :64], r2
+ vld1.16 {q2, q3}, [r1]
+ vdup.16 q0, r3
+ vdup.16 q1, r3
+ vmovl.u8 q10, d16
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+ vst1.8 {d2}, [r0, :64], r2
+ vst1.8 {d3}, [r0, :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_8x8_v_neon_8, export=1
+ mov r12, #4
+ vdup.16 q15, r3
+1:
+ vld2.8 {d16, d17}, [r0, :128], r2
+ vld2.8 {d18, d19}, [r0, :128]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d17
+ sub r0, r2
+ vmovl.u8 q11, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vaddw.u8 q2, q15, d16
+ vaddw.u8 q3, q15, d18
+ vqmovun.s16 d17, q0
+ vqmovun.s16 d16, q2
+ vqmovun.s16 d19, q1
+ vqmovun.s16 d18, q3
+ vst2.8 {d16, d17}, [r0, :128], r2
+ vst2.8 {d18, d19}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_16x16_v_neon_8, export=1
+ mov r12, #16
+ vdup.16 q15, r3
+1:
+ vld2.8 {q8, q9}, [r0, :256]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d18
+ vmovl.u8 q11, d19
+ vaddw.u8 q2, q15, d16
+ vaddw.u8 q3, q15, d17
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqmovun.s16 d16, q2
+ vqmovun.s16 d17, q3
+ vqmovun.s16 d18, q0
+ vqmovun.s16 d19, q1
+ vst2.8 {q8, q9}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ U & V add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_4x4_c_neon_8, export=1
+ vld1.8 {d16}, [r0, :64], r2
+ vld1.8 {d17}, [r0, :64], r2
+ vld1.8 {d18}, [r0, :64], r2
+ vld1.8 {d19}, [r0, :64], r2
+ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V
+ vmovl.u8 q10, d16
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+ vst1.8 {d2}, [r0, :64], r2
+ vst1.8 {d3}, [r0, :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_8x8_c_neon_8, export=1
+ mov r12, #8
+ add r3, r1, #(8*8*2) @ Offset to V
+1:
+ vld2.8 {d16, d17}, [r0, :128]
+ vld1.16 {q0}, [r1, :128]!
+ vld1.16 {q1}, [r3, :128]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst2.8 {d0, d1}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_16x16_c_neon_8, export=1
+ mov r12, #16
+ add r3, r1, #(16*16*2) @ Offset to V
+1:
+ vld2.8 {q8, q9}, [r0, :256]
+ vld1.16 {q0, q1}, [r1, :256]!
+ vld1.16 {q2, q3}, [r3, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst2.8 {q0, q1}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ 32x32 chroma never occurs so NIF
+
+@ ============================================================================
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
index 166bddb104..15c4329cdb 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -15,7 +15,7 @@
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1
*/
@@ -24,70 +24,238 @@
.macro hevc_loop_filter_chroma_start
ldr r12, [r2]
- ldr r3, [r2, #4]
- add r2, r3, r12
- cmp r2, #0
+ ldr r2, [r2, #4]
+ orrs r2, r12, r2, lsl #16
it eq
bxeq lr
.endm
-.macro hevc_loop_filter_chroma_body
- vsubl.u8 q3, d4, d2
- vsubl.u8 q11, d18, d19
- vshl.i16 q3, #2
- vadd.i16 q11, q3
- vdup.16 d0, r12
- vdup.16 d1, r3
- vrshr.s16 q11, q11, #3
- vneg.s16 q12, q0
+@ Uses: d2, d4, d18, d19
+@ Returns: d2, d4
+@ Modifies: d0-d7, d22-d25, r12
+
+.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1
+ vsubl.u8 q0, \Q0, \P0
+ vsubl.u8 q1, \P1, \Q1
+ vdup.16 d4, r2
+ lsr r2, r2, #16
+ vshl.i16 q0, #2
+ ldr r12, [sp, #0] @ r12 = &no_q
+ vadd.i16 q0, q1
+ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1]
+ vdup.16 d5, r2
+
+ vrshr.s16 q0, q0, #3
+ ldrh r12, [r12]
+ vneg.s16 q3, q2
+ vmin.s16 q0, q0, q2
+ vmovl.u8 q2, \Q0
+ vmax.s16 q0, q0, q3
+ vaddw.u8 q1, q0, \P0
+ vsub.i16 q2, q0
+ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+ vqmovun.s16 \P0, q1
+ vqmovun.s16 \Q0, q2
+.endm
+
+@ Uses r2 (tc a;b)
+@ Modifies: q0-q3
+@ On exit
+@ r12 (and flags) contain no_p;no_q
+.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth
+ vsub.i16 q0, \Q0, \P0
+ lsl r12, r2, #(\bit_depth - 8)
+ vsub.i16 q1, \P1, \Q1
+ vshl.i16 q0, #2
+ vdup.16 d4, r12
+ lsr r12, r12, #16
+ vadd.i16 q0, q1
+ ldrh r3, [r3]
+ vdup.16 d5, r12
+
+ vrshr.s16 q0, q0, #3
+ vneg.s16 q3, q2
+ movw r12, #(1 << \bit_depth) - 1
+ vmin.s16 q0, q0, q2
+ vmax.s16 q0, q0, q3
+ vdup.i16 q3, r12
+ ldr r12, [sp, #0]
+
+ vadd.i16 \P0, q0, \P0
+ vsub.i16 \Q0, q0
+
+ vmov.i64 q2, #0
+ ldrh r12, [r12]
+ vmin.s16 \P0, q3
+ vmin.s16 \Q0, q3
+ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+ vmax.s16 \P0, q2
+ vmax.s16 \Q0, q2
+.endm
+
+
+@ Preserves r12
+@ Clobbers r2
+.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v
+ vsubl.u8 q0, \Q0u, \P0u
+ vsubl.u8 q1, \Q0v, \P0v
+ vsubl.u8 q2, \P1u, \Q1u
+ vsubl.u8 q3, \P1v, \Q1v
+ vshl.i16 q0, #2
+ vshl.i16 q1, #2
+ vadd.i16 q0, q2
+ vdup.16 d4, r2
+ lsr r2, #16
+ vadd.i16 q1, q3
+
+ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all)
+ vrshr.s16 q0, #3
+ vdup.16 d6, r2
vmovl.u8 q2, d4
- vmin.s16 q11, q11, q0
- vmax.s16 q11, q11, q12
- vaddw.u8 q1, q11, d2
- vsub.i16 q2, q11
- vqmovun.s16 d2, q1
- vqmovun.s16 d4, q2
+ vmovl.u8 q3, d6
+ vuzp.16 d4, d5
+ vrshr.s16 q1, #3
+ vuzp.16 d6, d7
+
+ vmin.s16 q0, q2
+ vneg.s16 q2, q2
+ vmin.s16 q1, q3
+ vneg.s16 q3, q3
+ vmax.s16 q0, q2
+ vaddw.u8 q2, q0, \P0u
+ vmax.s16 q1, q3
+ vaddw.u8 q3, q1, \P0v
+
+ vqmovun.s16 \P0u, q2
+ vmovl.u8 q2, \Q0u
+ vqmovun.s16 \P0v, q3
+ vmovl.u8 q3, \Q0v
+ vsub.i16 q2, q0
+ vsub.i16 q3, q1
+
+ vqmovun.s16 \Q0u, q2
+ vqmovun.s16 \Q0v, q3
.endm
+@ Preserves r12
+@ Clobbers r2
+.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth
+ vsub.i16 q0, \Q0u, \P0u
+ vsub.i16 q1, \Q0v, \P0v
+ vsub.i16 q2, \P1u, \Q1u
+ vsub.i16 q3, \P1v, \Q1v
+ vshl.i16 q0, #2
+ vshl.i16 q1, #2
+ vadd.i16 q0, q2
+ vdup.16 d4, r2
+ lsr r2, #16
+ vadd.i16 q1, q3
+
+ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all)
+ vrshr.s16 q0, #3
+ vdup.16 d6, r2
+ vshll.u8 q2, d4, #\bit_depth - 8
+ vshll.u8 q3, d6, #\bit_depth - 8
+ vuzp.16 d4, d5
+ vrshr.s16 q1, #3
+ vuzp.16 d6, d7
+
+ movw r2, #(1 << \bit_depth) - 1
+ vmin.s16 q0, q2
+ vneg.s16 q2, q2
+ vmin.s16 q1, q3
+ vneg.s16 q3, q3
+ vmax.s16 q0, q2
+ vmov.i64 q2, #0
+ vmax.s16 q1, q3
+ vdup.i16 q3, r2
+ vadd.i16 \P0u, q0
+ vsub.i16 \Q0u, q0
+ vadd.i16 \P0v, q1
+ vsub.i16 \Q0v, q1
+
+ vmax.s16 \P0u, q2
+ vmax.s16 \Q0u, q2
+ vmax.s16 \P0v, q2
+ vmax.s16 \Q0v, q2
+ vmin.s16 \P0u, q3
+ vmin.s16 \Q0u, q3
+ vmin.s16 \P0v, q3
+ vmin.s16 \Q0v, q3
+.endm
+
+
+
.macro hevc_loop_filter_luma_start
ldr r12, [r3]
ldr r3, [r3, #4]
- lsl r3, #16
- orr r3, r12
- cmp r3, #0
+ orrs r3, r12, r3, lsl #16
it eq
bxeq lr
- lsr r3, #16
.endm
-.macro hevc_loop_filter_luma_body
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
- vmovl.u8 q12, d24
- vmovl.u8 q13, d26
- vmovl.u8 q14, d28
- vmovl.u8 q15, d30
+@ Uses: r2, r3, r12
+@ Modifies: r5, r6, r7, r8, r9
+
+@ Input:
+@ r2 beta (raw: needs shift for bitdepth > 8)
+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
+@ [sp,#96] &no_p[0]
+@ [sp,#100] &no_q[0]
+@
+@ Input & output
+@ 8-bit: d16-d23
+@ 16-bit: q8-q15
+@
+@ Output
+@ Z r10==0
+@ r10[ 0:7 ] no_p[0]
+@ r10[ 8:15] no_p[1]
+@ r10[16:23] no_q[0]
+@ r10[24:31] no_q[1]
+
+.macro m_filter_luma bit_depth
+.if \bit_depth == 8
+ vmovl.u8 q15, d23
+ vmovl.u8 q14, d22
+ vmovl.u8 q13, d21
+ vmovl.u8 q12, d20
+ vmovl.u8 q11, d19
+ vmovl.u8 q10, d18
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+.endif
vadd.i16 q7, q9, q11
+.if \bit_depth > 8
+ lsl r2, r2, #(\bit_depth - 8)
+.endif
vadd.i16 q6, q14, q12
+.if \bit_depth > 8
+ lsl r3, r3, #(\bit_depth - 8)
+.endif
vsub.i16 q7, q10
+ ldr r5, [sp, #96] @ Bolt no_x values together into r10
vsub.i16 q6, q13
vabd.s16 q7, q7, q10
vabd.s16 q6, q6, q13
-
+ ldrh r10, [r5]
vdup.16 q0, r2
vmov q4, q7
vmov q5, q6
- vdup.16 d4, r12
+ ldr r5, [sp, #100]
+ vdup.16 d4, r3
+ lsr r3, r3, #16
vtrn.16 q7, q4
+ ldrh r5, [r5]
vtrn.16 q6, q5
vshl.u64 q7, #32
vshr.u64 q4, #32
vshl.u64 q6, #32
+ orr r10, r10, r5, lsl #16
vshr.u64 q5, #32
vshr.u64 q7, #32
vshr.u64 q6, #32
@@ -152,7 +320,7 @@
and r9, r8, r7
cmp r9, #0
- beq weakfilter_\@
+ beq 1f
vadd.i16 q2, q11, q12
vadd.i16 q4, q9, q8
@@ -210,11 +378,11 @@
vbit q13, q3, q5
vbit q14, q2, q5
-weakfilter_\@:
+1:
mvn r8, r8
and r9, r8, r7
cmp r9, #0
- beq ready_\@
+ beq 2f
vdup.16 q4, r2
@@ -275,111 +443,1041 @@ weakfilter_\@:
vbit q11, q0, q5
vbit q12, q4, q5
-ready_\@:
+2:
+.if \bit_depth == 8
vqmovun.s16 d16, q8
- vqmovun.s16 d18, q9
- vqmovun.s16 d20, q10
- vqmovun.s16 d22, q11
- vqmovun.s16 d24, q12
- vqmovun.s16 d26, q13
- vqmovun.s16 d28, q14
- vqmovun.s16 d30, q15
+ cmp r10, #0
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vqmovun.s16 d20, q12
+ vqmovun.s16 d21, q13
+ vqmovun.s16 d22, q14
+ vqmovun.s16 d23, q15
+.else
+ movw r12, #(1 << \bit_depth - 1)
+ vmov.i64 q0, #0
+ vdup.i16 q1, r12
+ @ q8 & q15 should be unaltered and so don't require clipping
+ vmax.s16 q9, q0
+ cmp r10, #0
+ vmax.s16 q10, q0
+ vmax.s16 q11, q0
+ vmax.s16 q12, q0
+ vmax.s16 q13, q0
+ vmax.s16 q14, q0
+ vmin.s16 q9, q1
+ vmin.s16 q10, q1
+ vmin.s16 q11, q1
+ vmin.s16 q12, q1
+ vmin.s16 q13, q1
+ vmin.s16 q14, q1
+.endif
+ mov pc, lr
.endm
+function hevc_loop_filter_luma_body
+ m_filter_luma 8
+endfunc
+
+@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
+function ff_hevc_v_loop_filter_luma2_neon_8, export=1
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr} @ 8 regs = 32 bytes
+
+ ldr r4, [sp, #40]
+ b v_loop_luma_common
+endfunc
+
+
+@ void ff_hevc_v_loop_filter_luma_neon(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int _beta, [r2]
+@ int *_tc, [r3]
+@ uint8_t *_no_p, [sp+0]
+@ uint8_t *_no_q) [sp+4]
+
+
function ff_hevc_v_loop_filter_luma_neon, export=1
hevc_loop_filter_luma_start
- push {r5-r11}
+ push {r4-r10,lr}
+
+ sub r4, r0, #4
+v_loop_luma_common:
vpush {d8-d15}
- sub r0, #4
- vld1.8 {d16}, [r0], r1
- vld1.8 {d18}, [r0], r1
- vld1.8 {d20}, [r0], r1
- vld1.8 {d22}, [r0], r1
- vld1.8 {d24}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d28}, [r0], r1
- vld1.8 {d30}, [r0], r1
- sub r0, r0, r1, lsl #3
- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
- hevc_loop_filter_luma_body
- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
- vst1.8 {d16}, [r0], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d22}, [r0], r1
- vst1.8 {d24}, [r0], r1
- vst1.8 {d26}, [r0], r1
- vst1.8 {d28}, [r0], r1
- vst1.8 {d30}, [r0]
+
+ @ Uses slightly fewer instructions to do laned loads than unlaned
+ @ and transpose. This also means that we can use the same code for
+ @ both split & unsplit deblock
+ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
+ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
+
+ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+
+ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+
+ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+
+ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+
+ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+
+ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+
+ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
+ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
+
+ bl hevc_loop_filter_luma_body
+
+ neg r1, r1
+
+ @ no_p[1]
+ tst r10, #0xff00
+ add r2, r4, r1, lsl #2
+ bne 1f
+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32]
+1:
+ @ no_p[0]
+ tst r10, #0xff
+ bne 1f
+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1
+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1
+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1
+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32]
+1:
+ @ no_q[1]
+ tst r10, #0xff000000
+ add r2, r0, r1, lsl #2
+ bne 1f
+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32]
+1:
+ @ no_q[0]
+ tst r10, #0xff0000
+ bne 1f
+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1
+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1
+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
+1:
+bypasswrite:
vpop {d8-d15}
- pop {r5-r11}
- bx lr
+ pop {r4-r10,pc}
endfunc
+.macro m_filter_v_luma_common_16 bit_depth
+ vpush {d8-d15}
+
+ @ Uses slightly fewer instructions to do laned loads than unlaned
+ @ and transpose. This also means that we can use the same code for
+ @ both split & unsplit deblock
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4]
+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
+ neg r1, r1
+
+ @ p[1]
+ tst r10, #0xff00
+ add r2, r4, r1, lsl #2
+ bne 1f
+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4]
+1:
+ @ p[0]
+ tst r10, #0xff
+ bne 1f
+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1
+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1
+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1
+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2]
+1:
+ @ q[1]
+ tst r10, #0xff000000
+ add r2, r0, r1, lsl #2
+ bne 1f
+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0]
+1:
+ @ q[0]
+ tst r10, #0xff0000
+ bne 1f
+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1
+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1
+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
+1:
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+.endm
+
+
+
+
+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
+@ ptrdiff_t stride, [r1]
+@ int beta, [r2]
+@ int32_t *tc, [r3]
+@ uint8_t *no_p, sp[0]
+@ uint8_t *no_q); sp[4]
+@
+@ Src should always be on an 8 byte boundary & all in the same slice
+
function ff_hevc_h_loop_filter_luma_neon, export=1
hevc_loop_filter_luma_start
- push {r5-r11}
+ push {r4-r10,lr}
+
vpush {d8-d15}
sub r0, r0, r1, lsl #2
+
vld1.8 {d16}, [r0], r1
+ vld1.8 {d17}, [r0], r1
vld1.8 {d18}, [r0], r1
+ vld1.8 {d19}, [r0], r1
vld1.8 {d20}, [r0], r1
+ vld1.8 {d21}, [r0], r1
vld1.8 {d22}, [r0], r1
- vld1.8 {d24}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d28}, [r0], r1
- vld1.8 {d30}, [r0], r1
- sub r0, r0, r1, lsl #3
- add r0, r1
- hevc_loop_filter_luma_body
- vst1.8 {d18}, [r0], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d22}, [r0], r1
- vst1.8 {d24}, [r0], r1
- vst1.8 {d26}, [r0], r1
- vst1.8 {d28}, [r0]
-bypasswrite:
+ vld1.8 {d23}, [r0]
+
+ bl hevc_loop_filter_luma_body
+
vpop {d8-d15}
- pop {r5-r11}
- bx lr
+
+ neg r1, r1
+ add r0, r0, r1
+
+ bne 1f
+
+ vst1.8 {d22}, [r0], r1
+ vst1.8 {d21}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d19}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d17}, [r0]
+
+ pop {r4-r10,pc}
+
+@ Partial write
+1:
+ vmov r2, r3, d22
+ vmov r4, r5, d21
+ vmov r6, r7, d20
+
+ tst r10, #0xff0000
+ ittt eq
+ streq r2, [r0]
+ streq r4, [r0, r1]
+ streq r6, [r0, r1, lsl # 1]
+
+ add r0, r0, #4
+ tst r10, #0xff000000
+ ittt eq
+ streq r3, [r0]
+ streq r5, [r0, r1]
+ streq r7, [r0, r1, lsl # 1]
+
+ vmov r2, r3, d19
+ vmov r4, r5, d18
+ vmov r6, r7, d17
+ add r0, r0, r1
+ add r0, r0, r1, lsl # 1
+
+ tst r10, #0xff00
+ ittt eq
+ streq r3, [r0]
+ streq r5, [r0, r1]
+ streq r7, [r0, r1, lsl # 1]
+
+ tst r10, #0xff
+ ittt eq
+ streq r2, [r0, #-4]!
+ streq r4, [r0, r1]
+ streq r6, [r0, r1, lsl # 1]
+
+ pop {r4-r10,pc}
+
+endfunc
+
+
+.macro m_filter_h_luma_16 bit_depth
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr}
+
+ vpush {d8-d15}
+ sub r0, r0, r1, lsl #2
+
+ vld1.16 { q8}, [r0], r1
+ vld1.16 { q9}, [r0], r1
+ vld1.16 {q10}, [r0], r1
+ vld1.16 {q11}, [r0], r1
+ vld1.16 {q12}, [r0], r1
+ vld1.16 {q13}, [r0], r1
+ vld1.16 {q14}, [r0], r1
+ vld1.16 {q15}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
+ vpop {d8-d15}
+
+ sub r0, r1
+ neg r1, r1
+ bne 1f
+
+ vst1.16 {q14}, [r0], r1
+ vst1.16 {q13}, [r0], r1
+ vst1.16 {q12}, [r0], r1
+ vst1.16 {q11}, [r0], r1
+ vst1.16 {q10}, [r0], r1
+ vst1.16 { q9}, [r0]
+ pop {r4-r10,pc}
+
+@ Partial write
+1:
+ tst r10, #0xff0000
+ mov r2, r0
+ bne 1f
+ vst1.16 {d28}, [r2], r1
+ vst1.16 {d26}, [r2], r1
+ vst1.16 {d24}, [r2]
+
+1:
+ tst r10, #0xff000000
+ add r2, r0, #8
+ bne 1f
+ vst1.16 {d29}, [r2], r1
+ vst1.16 {d27}, [r2], r1
+ vst1.16 {d25}, [r2]
+
+1:
+ tst r10, #0xff
+ @ r0 = r0 + r1 * 3
+ add r0, r0, r1
+ add r0, r0, r1, lsl # 1
+ add r2, r0, #8
+ bne 1f
+ vst1.16 {d22}, [r0], r1
+ vst1.16 {d20}, [r0], r1
+ vst1.16 {d18}, [r0]
+
+1:
+ tst r10, #0xff00
+ bne 1f
+ vst1.16 {d23}, [r2], r1
+ vst1.16 {d21}, [r2], r1
+ vst1.16 {d19}, [r2]
+
+1:
+ pop {r4-r10,pc}
+.endm
+
+
+@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ unsigned int no_f); // r3
+@
+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+function ff_hevc_h_loop_filter_uv_neon_8, export=1
+ sub r0, r0, r1, lsl #1
+ vld2.8 {d16,d17}, [r0], r1
+ vld2.8 {d18,d19}, [r0], r1
+ vld2.8 {d26,d27}, [r0], r1
+ vld2.8 {d28,d29}, [r0]
+ sub r0, r0, r1, lsl #1
+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29
+ cmp r3, #0
+ bne 1f
+ vst2.8 {d18,d19}, [r0], r1
+ vst2.8 {d26,d27}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
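+ @ (lsls copies the relevant no_f bits into the N and C flags, so each
+ @  half-row store below can be skipped with bmi/bcs instead of extra tst/beq)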
+1: vzip.8 d18, d19
+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
+ vzip.8 d26, d27
+ sub r1, r1, #8
+
+ bmi 1f
+ vst1.8 {d18}, [r0]
+1: add r0, r0, #8
+ bcs 2f
+ vst1.8 {d19}, [r0]
+2: lsls r2, r3, #29 @ b2 -> N, b3 -> C
+ add r0, r0, r1
+
+ bmi 1f
+ vst1.8 {d26}, [r0]
+1: it cs
+ bxcs lr
+ add r0, r0, #8
+ vst1.8 {d27}, [r0]
+ bx lr
+
+endfunc
+
+
+@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ unsigned int no_f); // r3
+@
+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+@
+@ Macro here; actual function near bottom
+
+.macro m_filter_h_uv_16 bit_depth
+ sub r0, r0, r1, lsl #1
+ vld2.16 {q8, q9 }, [r0], r1
+ vld2.16 {q10, q11}, [r0], r1
+ vld2.16 {q12, q13}, [r0], r1
+ vld2.16 {q14, q15}, [r0]
+ sub r0, r0, r1, lsl #1
+
+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
+
+ cmp r3, #0
+ bne 1f
+ vst2.16 {q10, q11}, [r0], r1
+ vst2.16 {q12, q13}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
+1: vzip.16 q10, q11
+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
+ vzip.16 q12, q13
+ sub r1, r1, #16
+
+ bmi 1f
+ vst1.16 {q10}, [r0]
+1: add r0, r0, #16
+ bcs 2f
+ vst1.16 {q11}, [r0]
+2: lsls r2, r3, #29 @ b2 -> N, b3 -> C
+ add r0, r0, r1
+
+ bmi 1f
+ vst1.16 {q12}, [r0]
+1: it cs
+ bxcs lr
+ add r0, r0, #16
+ vst1.16 {q13}, [r0]
+ bx lr
+.endm
+
+
+@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ uint8_t * src_l, // r3
+@ unsigned int no_f); // sp[0]
+@
+@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+
+function ff_hevc_v_loop_filter_uv2_neon_8, export=1
+ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1
+ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1
+ sub r12, r0, r3
+
+ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1
+ cmp r12, #4
+
+ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1
+
+ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1
+
+ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1
+
+ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1
+
+ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1
+
+ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3]
+ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0]
+ it eq
+ ldreq r12, [sp, #0]
+
+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23
+ cmp r12, #0
+ add r3, #2
+ neg r1, r1
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1
+ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1
+ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1
+ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1
+ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1
+ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1
+ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1
+ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3]
+ bx lr
+
+@ Either split or partial
+1:
+ ldr r12, [sp, #0]
+ lsls r12, #29 @ b2 -> N, b3 -> C
+ add r2, r0, r1, lsl #2
+ bcs 1f
+ vst2.8 {d20[7], d21[7]}, [r0], r1
+ vst2.8 {d20[6], d21[6]}, [r0], r1
+ vst2.8 {d20[5], d21[5]}, [r0], r1
+ vst2.8 {d20[4], d21[4]}, [r0]
+1:
+ bmi 2f
+ vst2.8 {d20[3], d21[3]}, [r2], r1
+ vst2.8 {d20[2], d21[2]}, [r2], r1
+ vst2.8 {d20[1], d21[1]}, [r2], r1
+ vst2.8 {d20[0], d21[0]}, [r2]
+
+2:
+ lsls r12, #2
+ add r2, r3, r1, lsl #2
+ bcs 3f
+ vst2.8 {d18[7], d19[7]}, [r3], r1
+ vst2.8 {d18[6], d19[6]}, [r3], r1
+ vst2.8 {d18[5], d19[5]}, [r3], r1
+ vst2.8 {d18[4], d19[4]}, [r3]
+3:
+ it mi
+ bxmi lr
+ vst2.8 {d18[3], d19[3]}, [r2], r1
+ vst2.8 {d18[2], d19[2]}, [r2], r1
+ vst2.8 {d18[1], d19[1]}, [r2], r1
+ vst2.8 {d18[0], d19[0]}, [r2]
+ bx lr
endfunc
+
+@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ uint8_t * src_l, // r3
+@ unsigned int no_f); // sp[0]
+@
+@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+.macro m_filter_v_uv2_16 bit_depth
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1
+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+ sub r12, r0, r3
+
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1
+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+ cmp r12, #8
+
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1
+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1
+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1
+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1
+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1
+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3]
+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+ it eq
+ ldreq r12, [sp, #0]
+
+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
+ cmp r12, #0
+ add r3, #4
+ neg r1, r1
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1
+ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1
+ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1
+ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1
+ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1
+ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1
+ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1
+ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1
+ bx lr
+
+@ Either split or partial
+1:
+ ldr r12, [sp, #0]
+ lsls r12, #29 @ b2 -> N, b3 -> C
+ add r2, r0, r1, lsl #2
+ bcs 1f
+ vst2.16 {d25[3], d27[3]}, [r0], r1
+ vst2.16 {d25[2], d27[2]}, [r0], r1
+ vst2.16 {d25[1], d27[1]}, [r0], r1
+ vst2.16 {d25[0], d27[0]}, [r0]
+1:
+ bmi 2f
+ vst2.16 {d24[3], d26[3]}, [r2], r1
+ vst2.16 {d24[2], d26[2]}, [r2], r1
+ vst2.16 {d24[1], d26[1]}, [r2], r1
+ vst2.16 {d24[0], d26[0]}, [r2]
+
+2:
+ lsls r12, #2
+ add r2, r3, r1, lsl #2
+ bcs 3f
+ vst2.16 {d21[3], d23[3]}, [r3], r1
+ vst2.16 {d21[2], d23[2]}, [r3], r1
+ vst2.16 {d21[1], d23[1]}, [r3], r1
+ vst2.16 {d21[0], d23[0]}, [r3]
+3:
+ it mi
+ bxmi lr
+ vst2.16 {d20[3], d22[3]}, [r2], r1
+ vst2.16 {d20[2], d22[2]}, [r2], r1
+ vst2.16 {d20[1], d22[1]}, [r2], r1
+ vst2.16 {d20[0], d22[0]}, [r2]
+ bx lr
+.endm
+
+
+
function ff_hevc_v_loop_filter_chroma_neon, export=1
hevc_loop_filter_chroma_start
+
+ sub r0, #2
+ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1
+ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1
+ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1
+ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1
+ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1
+ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1
+ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1
+ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1
+
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #1
+ hevc_loop_filter_chroma_body d16, d17, d18, d19
+ bne 1f
+
+ vst2.8 {d17[0], d18[0]}, [r0], r1
+ vst2.8 {d17[1], d18[1]}, [r0], r1
+ vst2.8 {d17[2], d18[2]}, [r0], r1
+ vst2.8 {d17[3], d18[3]}, [r0], r1
+ vst2.8 {d17[4], d18[4]}, [r0], r1
+ vst2.8 {d17[5], d18[5]}, [r0], r1
+ vst2.8 {d17[6], d18[6]}, [r0], r1
+ vst2.8 {d17[7], d18[7]}, [r0], r1
+ bx lr
+
+1:
+ tst r12, #0xff @ P0a
+ bne 2f
+
+ vst1.8 {d17[0]}, [r0], r1
+ vst1.8 {d17[1]}, [r0], r1
+ vst1.8 {d17[2]}, [r0], r1
+ vst1.8 {d17[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+2:
+ tst r12, #0xff0000 @ Q0a
+ add r0, #1
+ bne 3f
+ vst1.8 {d18[0]}, [r0], r1
+ vst1.8 {d18[1]}, [r0], r1
+ vst1.8 {d18[2]}, [r0], r1
+ vst1.8 {d18[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+3:
+ tst r12, #0xff000000 @ Q0b
+ add r0, r0, r1, lsl #2
+ bne 4f
+ vst1.8 {d18[4]}, [r0], r1
+ vst1.8 {d18[5]}, [r0], r1
+ vst1.8 {d18[6]}, [r0], r1
+ vst1.8 {d18[7]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+4:
+ tst r12, #0xff00 @ P0b
+ it ne
+ bxne lr
+
+ sub r0, #1
+ vst1.8 {d17[4]}, [r0], r1
+ vst1.8 {d17[5]}, [r0], r1
+ vst1.8 {d17[6]}, [r0], r1
+ vst1.8 {d17[7]}, [r0], r1
+ bx lr
+
+endfunc
+
+
+.macro m_filter_v_chroma_16 bit_depth
+ hevc_loop_filter_chroma_start
+
sub r0, #4
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1
+
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #2
+ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth
+ bne 1f
+
+ vst2.16 {d18[0], d20[0]}, [r0], r1
+ vst2.16 {d18[1], d20[1]}, [r0], r1
+ vst2.16 {d18[2], d20[2]}, [r0], r1
+ vst2.16 {d18[3], d20[3]}, [r0], r1
+ vst2.16 {d19[0], d21[0]}, [r0], r1
+ vst2.16 {d19[1], d21[1]}, [r0], r1
+ vst2.16 {d19[2], d21[2]}, [r0], r1
+ vst2.16 {d19[3], d21[3]}, [r0], r1
+ bx lr
+
+1:
+ tst r12, #0xff @ P0a
+ bne 2f
+
+ vst1.16 {d18[0]}, [r0], r1
+ vst1.16 {d18[1]}, [r0], r1
+ vst1.16 {d18[2]}, [r0], r1
+ vst1.16 {d18[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+2:
+ tst r12, #0xff0000 @ Q0a
+        add      r0, #2        @ 16-bit pixels: Q0 column is 2 bytes right of P0
+ bne 3f
+ vst1.16 {d20[0]}, [r0], r1
+ vst1.16 {d20[1]}, [r0], r1
+ vst1.16 {d20[2]}, [r0], r1
+ vst1.16 {d20[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+3:
+ tst r12, #0xff000000 @ Q0b
+ add r0, r0, r1, lsl #2
+ bne 4f
+ vst1.16 {d21[0]}, [r0], r1
+ vst1.16 {d21[1]}, [r0], r1
+ vst1.16 {d21[2]}, [r0], r1
+ vst1.16 {d21[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+4:
+ tst r12, #0xff00 @ P0b
+ it ne
+ bxne lr
+
+        sub      r0, #2        @ back one 16-bit pixel from Q0 to the P0 column
+ vst1.16 {d19[0]}, [r0], r1
+ vst1.16 {d19[1]}, [r0], r1
+ vst1.16 {d19[2]}, [r0], r1
+ vst1.16 {d19[3]}, [r0], r1
+ bx lr
+.endm
+
+
+@ void ff_hevc_h_loop_filter_chroma_neon(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int *_tc, [r2]
+@ uint8_t *_no_p, [r3]
+@ uint8_t *_no_q); [sp+0]
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+ hevc_loop_filter_chroma_start
+ sub r0, r0, r1, lsl #1
vld1.8 {d16}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d18}, [r0], r1
- vld1.8 {d2}, [r0], r1
- vld1.8 {d4}, [r0], r1
- vld1.8 {d19}, [r0], r1
- vld1.8 {d20}, [r0], r1
- vld1.8 {d21}, [r0], r1
- sub r0, r0, r1, lsl #3
- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
- hevc_loop_filter_chroma_body
- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
- vst1.8 {d16}, [r0], r1
+ vld1.8 {d19}, [r0]
+ sub r0, r0, r1, lsl #1
+ hevc_loop_filter_chroma_body d16, d17, d18, d19
+ bne 1f @ Partial write
vst1.8 {d17}, [r0], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d2}, [r0], r1
- vst1.8 {d4}, [r0], r1
- vst1.8 {d19}, [r0], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d21}, [r0]
+ vst1.8 {d18}, [r0]
+ bx lr
+1:
+ tst r12, #0xff
+ vmov r2, r3, d17
+ it eq
+ streq r2, [r0]
+ tst r12, #0xff00
+ it eq
+ streq r3, [r0, #4]
+
+ add r0, r1
+ tst r12, #0xff0000
+ vmov r2, r3, d18
+ it eq
+ streq r2, [r0]
+ tst r12, #0xff000000
+ it eq
+ streq r3, [r0, #4]
+
bx lr
endfunc
-function ff_hevc_h_loop_filter_chroma_neon, export=1
+.macro m_filter_h_chroma_16 bit_depth
hevc_loop_filter_chroma_start
sub r0, r0, r1, lsl #1
- vld1.8 {d18}, [r0], r1
- vld1.8 {d2}, [r0], r1
- vld1.8 {d4}, [r0], r1
- vld1.8 {d19}, [r0]
+ vld1.16 {q8}, [r0], r1
+ vld1.16 {q9}, [r0], r1
+ vld1.16 {q10}, [r0], r1
+ vld1.16 {q11}, [r0]
sub r0, r0, r1, lsl #1
- hevc_loop_filter_chroma_body
- vst1.8 {d2}, [r0], r1
- vst1.8 {d4}, [r0]
+ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth
+ bne 1f @ Partial write
+ vst1.16 {q9}, [r0], r1
+ vst1.16 {q10}, [r0]
+ bx lr
+1:
+ tst r12, #0xff
+ bne 2f
+ vst1.16 {d18}, [r0]
+2:
+ tst r12, #0xff00
+ bne 3f
+ add r0, #8
+ vst1.16 {d19}, [r0]
+ sub r0, #8
+3:
+ tst r12, #0xff0000
+ add r0, r1
+ bne 4f
+ vst1.16 {d20}, [r0]
+4:
+ tst r12, #0xff000000
+ it ne
+ bxne lr
+ add r0, #8
+ vst1.16 {d21}, [r0]
+
bx lr
+.endm
+
+
+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+ */
+function ff_hevc_deblocking_boundary_strengths_neon, export=1
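+        @ Register aliases: a1-a4 = r0-r3, v1-v8 = r4-r11 (ARM APCS names)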
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
+ ldmia ip, {v5-v7}
+1: ldmdb ip, {v1-v4}
+ ldrsb a3, [v5, #8] @ curr->ref_idx
+ ldrsb v8, [v5, #9]
+ ldrsb ip, [v6, #8] @ neigh->ref_idx
+ ldrsb lr, [v6, #9]
+ ldr v1, [v1, a3, lsl #2]
+ ldrb a3, [v5, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
+ ldrb v8, [v6, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
+ teq a3, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
+ tst a3, #1
+ itee ne
+ ldrne a3, [v5, #0] @ curr->mv[0]
+ ldreq a3, [v5, #4] @ curr->mv[1]
+ moveq v1, v2
+ tst v8, #1
+ itee ne
+ ldrne v8, [v6, #0] @ neigh->mv[0]
+ ldreq v8, [v6, #4] @ neigh->mv[1]
+ moveq v3, v4
+ teq v1, v3
+ bne 10f
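+        @ 0xFFFCFFFC clears the two low bits of each packed 16-bit MV component,
+        @ so the ands below is non-zero iff the MV delta is >= 4 quarter-pel
+        @ (at least one integer pel) in either x or y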
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v8, a3
+ ssub16 a3, a3, v8
+ sel a3, a3, ip
+ ands a3, a3, lr
+ @ drop through
+10: it ne
+ movne a3, #1
+11: subs a2, a2, #1
+12:
+A strbhs a3, [v7], a4
+T itt hs
+T strbhs a3, [v7]
+T addhs v7, v7, a4
+ subs a2, a2, #1
+ bhs 12b
+
+ ldm sp, {a2, a3}
+ add ip, sp, #16*4
+ subs a1, a1, #1
+ add v5, v5, a3
+ add v6, v6, a3
+ bhi 1b
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
+ bne 10b
+
+ teq v1, v3
+ it eq
+ teqeq v2, v4
+ bne 40f
+ teq v1, v2
+ bne 30f
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 25f
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
+ ssub16 a3, v1, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v3, v2
+ ssub16 a3, v2, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+30: ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+40: teq v1, v4
+ ite eq
+ teqeq v2, v3
+ bne 10b
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
+90: mov a3, #1
+ b 11b
+endfunc
+
+@ =============================================================================
+@
+@ 10 bit
+
+function hevc_loop_filter_luma_body_10
+ m_filter_luma 10
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon_10, export=1
+ m_filter_h_luma_16 10
+endfunc
+
+function ff_hevc_v_loop_filter_luma2_neon_10, export=1
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr} @ 8 regs = 32 bytes
+
+ ldr r4, [sp, #40]
+ b v_loop_luma_common_10
+endfunc
+
+function ff_hevc_v_loop_filter_luma_neon_10, export=1
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr}
+
+ sub r4, r0, #8
+v_loop_luma_common_10:
+ m_filter_v_luma_common_16 10
+endfunc
+
+function ff_hevc_h_loop_filter_uv_neon_10, export=1
+ m_filter_h_uv_16 10
+endfunc
+
+function ff_hevc_v_loop_filter_uv2_neon_10, export=1
+ m_filter_v_uv2_16 10
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon_10, export=1
+ m_filter_h_chroma_16 10
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon_10, export=1
+ m_filter_v_chroma_16 10
endfunc
+
diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..00eab9eeee
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_epel_neon.S
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro vextin_d4
+ vld1.8 {q10}, [r1], r2
+ vmov d16, d20
+ vext.8 d17, d20, d21, #1
+ vext.8 d18, d20, d21, #2
+ vext.8 d19, d20, d21, #3
+.endm
+
+.macro vextin_d4_8
+ vld1.8 d16, [r1], r2
+ vext.8 d17, d16, d16, #1
+ vext.8 d18, d16, d16, #2
+ vext.8 d19, d16, d16, #3
+.endm
+
+.macro load_coeffs_16b coeffs
+ ldr \coeffs, [\coeffs]
+ vdup.i8 d0, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d1, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d2, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d3, \coeffs
+.endm
+
+.macro epel_filter_16b out=q12
+ vmull.u8 q3, d16, d0
+ vmull.u8 q11, d19, d3
+ vmull.u8 \out, d17, d1
+ vmull.u8 q10, d18, d2
+ vadd.s16 q3, q11
+ vadd.s16 \out, q10
+ vsub.s16 \out, q3
+.endm
+
+.macro load_coeffs_32b coeffs
+ ldr \coeffs, [\coeffs]
+ vmov.i64 d4, #0
+ vmov.8 d4[0], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[2], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[4], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[6], \coeffs
+.endm
+
+.macro epel_filter_32b
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q4, d25, d4[0]
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q6, d31, d4[3]
+
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q8, d27, d4[1]
+ vmull.s16 q9, d28, d4[2] // q14
+ vmull.s16 q10, d29, d4[2]
+ vadd.s32 q3, q5
+ vadd.s32 q4, q6
+ vadd.s32 q7, q9
+ vadd.s32 q8, q10
+ vsub.s32 q7, q3
+ vsub.s32 q8, q4
+ vqshrn.s32 d6, q7, #6
+ vqshrn.s32 d7, q8, #6
+.endm
+
+.macro epel_filter_32b_4
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q9, d28, d4[2] // q14
+ vadd.s32 q3, q5
+ vadd.s32 q7, q9
+ vsub.s32 q7, q3
+ vqshrn.s32 d6, q7, #6
+.endm
+
+function ff_hevc_put_epel_h_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #16] // mx
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+@ Plain adr reaches epel_coeffs when assembled as Thumb but not as ARM, so ARM uses adrl
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ sub r1, #1
+ lsl r4, #1
+ load_coeffs_16b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: subs r3, #1
+ pld [r1]
+ vextin_d4
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ cmp r5, #4
+ bgt 8b
+4: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+2: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_v_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ load_coeffs_16b r7
+ sub r1, r2
+ lsl r4, #1
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vld1.8 {d16}, [r1], r2
+ pld [r1]
+ vld1.8 {d17}, [r1], r2
+ pld [r1]
+ vld1.8 {d18}, [r1], r2
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_hv_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r6, [sp, #16] // mx
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+ adr r12, epel_coeffs
+ sub r6, #1
+ lsl r6, #2
+ add r6, r12 // mx epel coeff offset
+ add r7, r12
+ sub r1, #1
+ sub r1, r2
+ lsl r4, #1
+ load_coeffs_16b r6
+ load_coeffs_32b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vextin_d4
+ epel_filter_16b q12
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q13
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q14
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vextin_d4
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b
+ vst1.16 {q3}, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.16 d6, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.32 d6[0], [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
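+@ HEVC 4-tap chroma (epel) interpolation filter coefficient magnitudes for
+@ fractional positions 1..7.  The outer taps are negative for every position,
+@ so epel_filter_16b / epel_filter_32b subtract them rather than storing signs.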
+epel_coeffs:
+ .byte 2, 58, 10, 2
+ .byte 4, 54, 16, 2
+ .byte 6, 46, 28, 4
+ .byte 4, 36, 36, 4
+ .byte 4, 28, 46, 6
+ .byte 2, 16, 54, 4
+ .byte 2, 10, 58, 2
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
index 13d540e5ff..9b6d745556 100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -21,82 +21,6 @@
#include "libavutil/arm/asm.S"
#include "neon.S"
-function ff_hevc_idct_4x4_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- vdup.16 q0, r1
- vdup.16 q1, r1
- vst1.16 {q0, q1}, [r0]
- bx lr
-endfunc
-
-function ff_hevc_idct_8x8_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- vdup.16 q8, r1
- vdup.16 q9, r1
- vmov.16 q10, q8
- vmov.16 q11, q8
- vmov.16 q12, q8
- vmov.16 q13, q8
- vmov.16 q14, q8
- vmov.16 q15, q8
- vstm r0, {q8-q15}
- bx lr
-endfunc
-
-function ff_hevc_idct_16x16_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- vdup.16 q8, r1
- vdup.16 q9, r1
- vmov.16 q10, q8
- vmov.16 q11, q8
- vmov.16 q12, q8
- vmov.16 q13, q8
- vmov.16 q14, q8
- vmov.16 q15, q8
- vstm r0!, {q8-q15}
- vstm r0!, {q8-q15}
- vstm r0!, {q8-q15}
- vstm r0, {q8-q15}
- bx lr
-endfunc
-
-function ff_hevc_idct_32x32_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- mov r3, #16
- vdup.16 q8, r1
- vdup.16 q9, r1
- vmov.16 q10, q8
- vmov.16 q11, q8
- vmov.16 q12, q8
- vmov.16 q13, q8
- vmov.16 q14, q8
- vmov.16 q15, q8
-1: subs r3, #1
- vstm r0!, {q8-q15}
- bne 1b
- bx lr
-endfunc
-
function ff_hevc_transform_add_4x4_neon_8, export=1
vldm r1, {q0-q1}
vld1.32 d4[0], [r0], r2
@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1
bx lr
endfunc
+
+@ ff_hevc_add_residual_4x4_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_4x4_dc_neon_8, export=1
+ vdup.16 q15, r2
+
+ vld1.32 d4[0], [r0], r1
+ vld1.32 d4[1], [r0], r1
+ vld1.32 d5[0], [r0], r1
+ vld1.32 d5[1], [r0], r1
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q0, q15, d4
+ vaddw.u8 q1, q15, d5
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d0[1], [r0], r1
+ vst1.32 d1[0], [r0], r1
+ vst1.32 d1[1], [r0], r1
+ bx lr
+endfunc
+
+
+@ ff_hevc_add_residual_4x4_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1
+ vdup.32 q15, r2
+ mov r3, #4
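+        @ 4x4 chroma = 4 rows of 4 interleaved U/V pairs = 8 bytes per row,
+        @ so reuse the row loop (label 1) in the 8x8 dc function below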
+ b 1f
+endfunc
+
+@ ff_hevc_add_residual_8x8_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_8x8_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #8
+
+1: subs r3, #1
+ vld1.8 d16, [r0]
+ vaddw.u8 q0, q15, d16
+ vqmovun.s16 d0, q0
+ vst1.32 d0, [r0], r1
+ bne 1b
+ bx lr
+endfunc
+
+
+@ ff_hevc_add_residual_8x8_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1
+ vdup.32 q15, r2
+ mov r3, #8
+ b 1f
+endfunc
+
+@ ff_hevc_add_residual_16x16_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_16x16_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #16
+
+1: subs r3, #1
+ vld1.8 {q8}, [r0]
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.8 {q0}, [r0], r1
+ bne 1b
+ bx lr
+endfunc
+
+
+@ ff_hevc_add_residual_16x16_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1
+ vdup.32 q15, r2
+ mov r3, #16
+ b 1f
+endfunc
+
+@ ff_hevc_add_residual_32x32_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_32x32_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #32
+
+1: subs r3, #1
+ vld1.8 {q8, q9}, [r0]
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vaddw.u8 q2, q15, d18
+ vaddw.u8 q3, q15, d19
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst1.8 {q0, q1}, [r0], r1
+ bne 1b
+ bx lr
+endfunc
+
+
+
.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.64 \r0, \r4
vtrn.64 \r1, \r5
@@ -263,55 +312,6 @@ endfunc
vqrshrn.s32 \r3, q3, \shift
.endm
-function ff_hevc_transform_4x4_neon_8, export=1
- vpush {d8-d15}
- vld1.16 {q14, q15}, [r0] // coeffs
- ldr r3, =0x00240053 // 36 and 83
- vmov.32 d0[0], r3
-
- tr4_shift d28, d29, d30, d31, #7
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
-
- tr4_shift d28, d29, d30, d31, #12
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
-
- vst1.16 {q14, q15}, [r0]
- vpop {d8-d15}
- bx lr
-endfunc
-
-function ff_hevc_transform_luma_4x4_neon_8, export=1
- vpush {d8-d15}
- vld1.16 {q14, q15}, [r0] // coeffs
- ldr r3, =0x4a // 74
- vmov.32 d0[0], r3
- ldr r3, =0x1d // 29
- vmov.32 d0[1], r3
- ldr r3, =0x37 // 55
- vmov.32 d1[0], r3
-
- tr4_luma_shift d28, d29, d30, d31, #7
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
-
- tr4_luma_shift d28, d29, d30, d31, #12
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
- vst1.16 {q14, q15}, [r0]
- vpop {d8-d15}
- bx lr
-endfunc
-
.macro tr8_begin in0, in1, in2, in3
vmull.s16 q7, \in0, d1[1] // 89 * src1
vmull.s16 q8, \in0, d1[0] // 75 * src1
@@ -356,100 +356,6 @@ endfunc
vqrshrn.s32 d8, q5, \shift
.endm
-function ff_hevc_transform_8x8_neon_8, export=1
- push {r4-r8}
- vpush {d8-d15}
- mov r5, #16
-
- adr r3, tr4f
- vld1.16 {d0, d1}, [r3]
-
- // left half
- vld1.16 {d24}, [r0], r5
- vld1.16 {d25}, [r0], r5
- vld1.16 {d26}, [r0], r5
- vld1.16 {d27}, [r0], r5
- vld1.16 {d28}, [r0], r5
- vld1.16 {d29}, [r0], r5
- vld1.16 {d30}, [r0], r5
- vld1.16 {d31}, [r0], r5
- sub r0, #128
- tr8_begin d25, d27, d29, d31
- tr4 d24, d26, d28, d30
- tr8_end #7
- vst1.16 {d2}, [r0], r5
- vst1.16 {d3}, [r0], r5
- vst1.16 {d4}, [r0], r5
- vst1.16 {d5}, [r0], r5
- vst1.16 {d6}, [r0], r5
- vst1.16 {d7}, [r0], r5
- vst1.16 {d8}, [r0], r5
- vst1.16 {d9}, [r0], r5
- sub r0, #128
- //skip right half if col_limit in r1 is less than 4
- cmp r1, #4
- blt 1f
- //right half
- add r0, #8
- vld1.16 {d24}, [r0], r5
- vld1.16 {d25}, [r0], r5
- vld1.16 {d26}, [r0], r5
- vld1.16 {d27}, [r0], r5
- vld1.16 {d28}, [r0], r5
- vld1.16 {d29}, [r0], r5
- vld1.16 {d30}, [r0], r5
- vld1.16 {d31}, [r0], r5
- sub r0, #128
- tr8_begin d25, d27, d29, d31
- tr4 d24, d26, d28, d30
- tr8_end #7
- vst1.16 {d2}, [r0], r5
- vst1.16 {d3}, [r0], r5
- vst1.16 {d4}, [r0], r5
- vst1.16 {d5}, [r0], r5
- vst1.16 {d6}, [r0], r5
- vst1.16 {d7}, [r0], r5
- vst1.16 {d8}, [r0], r5
- vst1.16 {d9}, [r0], r5
- sub r0, #136
-1:
- // top half
- vldm r0, {q12-q15} // coeffs
- transpose_16b_4x4 d24, d26, d28, d30
- transpose_16b_4x4 d25, d27, d29, d31
- tr8_begin d26, d30, d27, d31
- tr4 d24, d28, d25, d29
- tr8_end #12
- transpose_16b_4x4 d2, d3, d4, d5
- transpose_16b_4x4 d6, d7, d8, d9
- vswp d7, d5
- vswp d7, d8
- vswp d3, d6
- vswp d6, d4
- vstm r0!, {q1-q4}
-
- // bottom half
- vldm r0, {q12-q15} // coeffs
- transpose_16b_4x4 d24, d26, d28, d30
- transpose_16b_4x4 d25, d27, d29, d31
- tr8_begin d26, d30, d27, d31
- tr4 d24, d28, d25, d29
- tr8_end #12
- transpose_16b_4x4 d2, d3, d4, d5
- transpose_16b_4x4 d6, d7, d8, d9
- vswp d7, d5
- vswp d7, d8
- vswp d3, d6
- vswp d6, d4
- //vstm r0, {q1-q4}
- vst1.16 {q1-q2}, [r0]
- add r0, #32
- vst1.16 {q3-q4}, [r0]
- sub r0, #32
- vpop {d8-d15}
- pop {r4-r8}
- bx lr
-endfunc
.align 4
tr4f:
@@ -463,3 +369,11 @@ tr16:
.word 0x00500046 // 80, d2[2] = 70
.word 0x0039002b // 57, d2[0] = 43
.word 0x00190009 // 25, d2[2] = 9
+
+#define BIT_DEPTH 8
+#include "hevc_idct_fn_neon.S"
+
+#undef BIT_DEPTH
+#define BIT_DEPTH 10
+#include "hevc_idct_fn_neon.S"
+
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 55918077e2..e708b7c074 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -22,11 +22,41 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/bit_depth_template.c"
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+#ifdef RPI
+void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+ const uint8_t no_p[2], const uint8_t no_q[2],
+ uint8_t * _pix_l);
+void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+ const uint8_t no_p[2], const uint8_t no_q[2],
+ uint8_t * _pix_l);
+void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+void ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+#endif
+
void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+
+void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs);
+
void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
+
+void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+
+
+void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+
+
+#if RPI_HEVC_SAND
+void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+
+
+void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+#endif
+
+void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+
+void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+
+#if RPI_HEVC_SAND
+void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+
+void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+
+void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+
+void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+#endif
+
+void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
#undef PUT_PIXELS
+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, int width);
@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
}
+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
+
+
+static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
+}
+static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
+}
+
+static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+
+#if SAO_FILTER_N == 6
+static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
+}
+static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
+}
+
+static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+
+#if RPI_HEVC_SAND
+static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height)
+{
+ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height)
+{
+ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+
+static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+#endif
+#endif
+
+
+
+#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160
+#error SAO edge src stride not 160 - value used in .S
+#endif
+
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
if (bit_depth == 8) {
int x;
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
+ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon;
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon;
+#ifdef RPI
+ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8;
+ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8;
+ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8;
+#endif
c->idct[0] = ff_hevc_transform_4x4_neon_8;
c->idct[1] = ff_hevc_transform_8x8_neon_8;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
+ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
+ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
+ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
+ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
+ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8;
+ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8;
+ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8;
+ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8;
+#if RPI_HEVC_SAND
+ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8;
+ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8;
+ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8;
+ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8;
+ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8;
+ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8;
+ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8;
+ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8;
+ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8;
+ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8;
+ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8;
+ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8;
+#endif
c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8;
+ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8;
+ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8;
+ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8;
+ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8;
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8;
+ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8;
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8;
+#if SAO_FILTER_N == 6
+ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8;
+ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8;
+#endif
+#if RPI_HEVC_SAND
+ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8;
+ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8;
+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8;
+
+ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8;
+ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8;
+ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8;
+
+#if SAO_FILTER_N == 6
+ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8;
+ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8;
+#endif
+#endif
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_epel[x][1][0] = ff_hevc_put_epel_v_neon_8;
+ c->put_hevc_epel[x][0][1] = ff_hevc_put_epel_h_neon_8;
+ c->put_hevc_epel[x][1][1] = ff_hevc_put_epel_hv_neon_8;
}
+ c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
}
+ else if (bit_depth == 10) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10;
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10;
+#ifdef RPI
+ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10;
+ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10;
+ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10;
+#endif
+ c->idct[0] = ff_hevc_transform_4x4_neon_10;
+ c->idct[1] = ff_hevc_transform_8x8_neon_10;
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10;
+ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10;
+ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10;
+ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10;
+ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10;
+ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10;
+ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10;
+ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10;
+ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10;
+#if RPI_HEVC_SAND
+ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10;
+ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10;
+ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10;
+ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10;
+ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10;
+ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10;
+ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10;
+ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10;
+ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10;
+ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10;
+ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10;
+ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10;
+#endif
+ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10;
+ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10;
+ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10;
+ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10;
+ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10;
+ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10;
+
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10;
+ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10;
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10;
+#if SAO_FILTER_N == 6
+ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10;
+ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10;
+#endif
+#if RPI_HEVC_SAND
+ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10;
+ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10;
+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10;
+
+ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10;
+ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10;
+ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10;
+
+#if SAO_FILTER_N == 6
+ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10;
+ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10;
+#endif
+#endif
+ }
+
+ assert(offsetof(MvField, mv) == 0);
+ assert(offsetof(MvField, ref_idx) == 8);
+ assert(offsetof(MvField, pred_flag) == 10);
+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
}
diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S
new file mode 100644
index 0000000000..7cc5cd5e5c
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_res16_neon.S
@@ -0,0 +1,610 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define BIT_DEPTH 10
+
+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmax.s16 \Q2, \Q_MIN
+ vmax.s16 \Q3, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+ vmin.s16 \Q2, \Q_MAX
+ vmin.s16 \Q3, \Q_MAX
+.endm
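+
+@ All the add_residual functions below follow the same basic pattern; a rough
+@ scalar model (for reference only, not part of the build) is:
+@   dst[x] = clip(dst[x] + res[x], 0, (1 << BIT_DEPTH) - 1)
+@ The "_dc" variants add a constant dc value instead of a per-sample residual,
+@ the "_u"/"_v" variants add the residual to one plane of interleaved U/V data
+@ and a dc value to the other, and the "_c" variants add residuals to both.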
+
+@ add_residual4x4(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1
+ vld1.16 {q10, q11}, [r1]
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vld1.16 {d0}, [r0, :64], r2
+ vld1.16 {d1}, [r0, :64], r2
+ vld1.16 {d2}, [r0, :64], r2
+ vld1.16 {d3}, [r0, :64], r2
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ sub r0, r0, r2, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {d0}, [r0, :64], r2
+ vst1.16 {d1}, [r0, :64], r2
+ vst1.16 {d2}, [r0, :64], r2
+ vst1.16 {d3}, [r0, :64], r2
+ bx lr
+
+endfunc
+
+@ add_residual4x4_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vdup.i16 q9, r3
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vdup.16 q15, r2
+ vld1.16 {d2}, [r0, :64], r1
+ vld1.16 {d3}, [r0, :64], r1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ bx lr
+
+endfunc
+
+
+@ add_residual8x8(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ mov r12, #2
+1:
+ vldm r1!, {q10-q13}
+ vld1.16 {q0}, [r0, :128], r2
+ subs r12, #1
+ vld1.16 {q1}, [r0, :128], r2
+ vqadd.s16 q0, q10
+ vld1.16 {q2}, [r0, :128], r2
+ vqadd.s16 q1, q11
+ vld1.16 {q3}, [r0, :128], r2
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ sub r0, r0, r2, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {q0}, [r0, :128], r2
+ vmin.s16 q2, q2, q9
+ vst1.16 {q1}, [r0, :128], r2
+ vmin.s16 q3, q3, q9
+ vst1.16 {q2}, [r0, :128], r2
+ vst1.16 {q3}, [r0, :128], r2
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual4x4_dc_c(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
+ mov r12, #1
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual8x8_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
+ mov r12, #2
+ vdup.16 q15, r2
+9:
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+1:
+ vld1.16 {q0}, [r0, :128], r1
+ subs r12, #1
+ vld1.16 {q1}, [r0, :128], r1
+ vqadd.s16 q0, q15
+ vld1.16 {q2}, [r0, :128], r1
+ vqadd.s16 q1, q15
+ vld1.16 {q3}, [r0, :128], r1
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q8
+ vmax.s16 q1, q8
+ vmax.s16 q2, q8
+ vmax.s16 q3, q8
+ vmin.s16 q0, q9
+ vmin.s16 q1, q9
+ vst1.16 {q0}, [r0, :128], r1
+ vmin.s16 q2, q9
+ vst1.16 {q1}, [r0, :128], r1
+ vmin.s16 q3, q9
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual16x16(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ mov r12, #8
+1:
+ vldm r1!, {q10-q13}
+ @ For RPI Sand we could guarantee :256 but not for general
+ @ non-RPI allocation. :128 is as good as we can claim
+ vld1.16 {q0, q1}, [r0, :128], r2
+ subs r12, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ sub r0, r2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+ vst1.16 {q0, q1}, [r0, :128], r2
+ vst1.16 {q2, q3}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual8x8_dc_c(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
+ mov r12, #4
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual16x16_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
+ vdup.i16 q15, r2
+ mov r12, #8
+9:
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+1:
+ @ For RPI Sand we could guarantee :256 but not for general
+ @ non-RPI allocation. :128 is as good as we can claim
+ vld1.16 {q0, q1}, [r0, :128], r1
+ subs r12, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ sub r0, r1
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+
+@ add_residual32x32(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ mov r12, #32
+1:
+ vldm r1!, {q10-q13}
+ vldm r0, {q0-q3}
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vstm r0, {q0-q3}
+ add r0, r2
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual16x16_dc_c(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
+ mov r12, #16
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual32x32_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
+ vdup.i16 q15, r2
+ mov r12, #32
+9:
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+1:
+ vldm r0, {q0-q3}
+ subs r12, #1
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vstm r0, {q0-q3}
+ add r0, r1
+ bne 1b
+ bx lr
+
+endfunc
+
+@ ============================================================================
+@ U add
+
+@ add_residual4x4_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
+ vld1.16 {q10, q11}, [r1, :256]
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+
+ vld2.16 {d0, d2}, [r0, :128], r2
+ vld2.16 {d1, d3}, [r0, :128], r2
+ vld2.16 {d4, d6}, [r0, :128], r2
+ vld2.16 {d5, d7}, [r0, :128], r2
+
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ sub r0, r0, r2, lsl #2
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0, :128], r2
+ vst2.16 {d1, d3}, [r0, :128], r2
+ vst2.16 {d4, d6}, [r0, :128], r2
+ vst2.16 {d5, d7}, [r0, :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #4
+ vdup.i16 q9, r3
+1:
+ vld2.16 {q0, q1}, [r0, :256], r2
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ sub r0, r2
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256], r2
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #16
+ vdup.i16 q9, r3
+ sub r2, #32
+1:
+ vld2.16 {q0, q1}, [r0, :256]!
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ sub r0, #32
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256]!
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ V add
+
+@ add_residual4x4_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
+ vld1.16 {q10, q11}, [r1, :256]
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+
+ vld2.16 {d0, d2}, [r0, :128], r2
+ vld2.16 {d1, d3}, [r0, :128], r2
+ vld2.16 {d4, d6}, [r0, :128], r2
+ vld2.16 {d5, d7}, [r0, :128], r2
+
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ sub r0, r0, r2, lsl #2
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0, :128], r2
+ vst2.16 {d1, d3}, [r0, :128], r2
+ vst2.16 {d4, d6}, [r0, :128], r2
+ vst2.16 {d5, d7}, [r0, :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #4
+ vdup.i16 q9, r3
+1:
+ vld2.16 {q0, q1}, [r0, :256], r2
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ sub r0, r2
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256], r2
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #16
+ vdup.i16 q9, r3
+ sub r2, #32
+1:
+ vld2.16 {q0, q1}, [r0, :256]!
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ sub r0, #32
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256]!
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ U & V add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
+ vldm r1, {q10-q13}
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+
+ vld2.16 {d0, d2}, [r0, :128], r2
+ vld2.16 {d1, d3}, [r0, :128], r2
+ vld2.16 {d4, d6}, [r0, :128], r2
+ vld2.16 {d5, d7}, [r0, :128], r2
+
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ sub r0, r0, r2, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+
+ vst2.16 {d0, d2}, [r0, :128], r2
+ vst2.16 {d1, d3}, [r0, :128], r2
+ vst2.16 {d4, d6}, [r0, :128], r2
+ vst2.16 {d5, d7}, [r0, :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #4
+ vdup.i16 q9, r3
+ add r3, r1, #(8*8*2) @ Offset to V
+1:
+ vld2.16 {q0, q1}, [r0, :256], r2
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ vld1.16 {q12, q13}, [r3, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ sub r0, r2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+ vst2.16 {q0, q1}, [r0, :256], r2
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #16
+ vdup.i16 q9, r3
+ add r3, r1, #(16*16*2) @ Offset to V
+ sub r2, #32
+1:
+ vld2.16 {q0, q1}, [r0, :256]!
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ vld1.16 {q12, q13}, [r3, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ sub r0, #32
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+ vst2.16 {q0, q1}, [r0, :256]!
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
new file mode 100644
index 0000000000..30113d9c93
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_sao_neon.S
@@ -0,0 +1,1882 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.set EDGE_SRC_STRIDE, 160
+
+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128
+ vshr.u8 q12, q8, #3
+ vadd.s8 q8, \Q_K128
+ vshr.u8 q13, q9, #3
+ vadd.s8 q9, \Q_K128
+
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT0, d25
+ vtbl.8 d26, \XLAT1, d26
+ vtbl.8 d27, \XLAT1, d27
+
+ vqadd.s8 q8, q12
+ vshr.u8 q12, q10, #3
+ vadd.s8 q10, \Q_K128
+ vqadd.s8 q9, q13
+ vshr.u8 q13, q11, #3
+ vadd.s8 q11, \Q_K128
+
+ vsub.s8 q8, \Q_K128
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT0, d25
+ vsub.s8 q9, \Q_K128
+ vtbl.8 d26, \XLAT1, d26
+ vtbl.8 d27, \XLAT1, d27
+ vqadd.s8 q10, q12
+ vqadd.s8 q11, q13
+ vsub.s8 q10, \Q_K128
+ vsub.s8 q11, \Q_K128
+.endm
+
+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128
+ vshr.u8 q12, q8, #3
+ vadd.s8 q8, \Q_K128
+
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+
+ vqadd.s8 q8, q12
+ vsub.s8 q8, \Q_K128
+.endm
+
+
+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmax.s16 \Q2, \Q_MIN
+ vmax.s16 \Q3, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+ vmin.s16 \Q2, \Q_MAX
+ vmin.s16 \Q3, \Q_MAX
+.endm
+
+@ Clobbers q12, q13
+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth
+ vshrn.i16 d24, \Q0, #(\bit_depth - 5)
+ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
+ vshrn.i16 d26, \Q2, #(\bit_depth - 5)
+ vshrn.i16 d27, \Q3, #(\bit_depth - 5)
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+ vtbl.8 d26, \XLAT0, d26
+ vtbl.8 d27, \XLAT1, d27
+ vaddw.s8 \Q0, d24
+ vaddw.s8 \Q1, d25
+ vaddw.s8 \Q2, d26
+ vaddw.s8 \Q3, d27
+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
+.endm
+
+@ Clobbers q12
+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth
+ vshrn.i16 d24, \Q0, #(\bit_depth - 5)
+ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+ vaddw.s8 \Q0, d24
+ vaddw.s8 \Q1, d25
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+.endm
+
+
+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
+@ so we are quite safe stuffing it into a byte array
+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
+@ precision
+
+@ This, somewhat nasty, bit of code builds the {d0-d3} translation
+@ array via the stack
+@ Given that sao_left_class > 28 can cause wrap we can't just poke
+@ all 4 bytes in at once
+@
+@ It also loads other common regs
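+@
+@ As a rough scalar model (for reference only), the band filter built from
+@ this table is approximately:
+@   band   = src[x] >> (bit_depth - 5);            @ 32 bands
+@   dst[x] = clip(src[x] + offset_table[band]);
+@ where offset_table[] is all zeros except the 4 entries starting at
+@ sao_left_class (wrapping mod 32), which hold sao_offset_val[1..4]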
+
+function band_load_y
+ vmov.i64 q0, #0
+ ldr r12, [sp, #8] @ &sao_offset_val[0]
+ add r12, #2 @ 1st interesting val is [1]
+ vld1.16 {d16}, [r12] @ Unaligned
+ vmov.i64 q1, #0
+ ldr r12, [sp, #12] @ sao_left_class
+
+ mov r4, sp
+ sub sp, #32
+ and sp, #~63 @ Align stack so we can wrap with a simple AND
+ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack
+ add r12, sp
+ vst1.8 {d16[0]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[2]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[4]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[6]}, [r12]
+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array
+ mov sp, r4
+
+ ldr r12, [sp, #20] @ height
+ pld [r1]
+
+ sub r12, #1
+ add r4, r1, r3
+ bx lr
+endfunc
+
+
+function band_load_c
+ vmov.i64 q2, #0
+ ldr r12, [sp, #8] @ &sao_offset_val1[0]
+ add r12, #2 @ 1st interesting val is [1]
+ vld1.16 {d16}, [r12] @ Unaligned
+ vmov.i64 q3, #0
+ ldr r12, [sp, #12] @ sao_left_class
+
+ mov r4, sp @ Remember SP
+ sub sp, #32
+ and sp, #~63 @ Align stack so we can wrap with a simple AND
+
+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack
+ add r12, sp
+ vst1.8 {d16[0]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[2]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[4]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[6]}, [r12]
+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array
+
+ @ And again for the 2nd set
+ ldr r12, [r4, #16] @ &sao_offset_val2[0]
+ add r12, #2 @ 1st interesting val is [1]
+ vld1.16 {d16}, [r12] @ Unaligned
+ ldr r12, [r4, #20] @ sao_left_class2
+
+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again)
+ add r12, sp
+ vst1.8 {d16[0]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[2]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[4]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[6]}, [r12]
+ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array
+
+ mov sp, r4
+
+ ldr r12, [sp, #28] @ height
+ pld [r1]
+
+ subs r12, #1
+ add r4, r1, r3
+ bx lr
+endfunc
+
+
+@ ff_hevc_sao_band_64_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_64_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ vmov.u8 q15, #128
+
+1: subs r12, #1
+ vldm r1, {q8-q11}
+ pld [r4]
+ add r1, r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ it ne
+ addne r4, r3
+ vstm r0, {q8-q11}
+ add r0, r2
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_32_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_32_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ vmov.u8 q15, #128
+
+1: subs r12, #2
+ vld1.8 { q8, q9 }, [r1, :128], r3
+ vld1.8 {q10, q11}, [r1, :128], r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.8 { q8, q9 }, [r0, :128], r2
+ vst1.8 {q10, q11}, [r0, :128], r2
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_16_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_16_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ vmov.u8 q15, #128
+
+1: subs r12, #4
+ vld1.8 { q8}, [r1, :128], r3
+ vld1.8 { q9}, [r1, :128], r3
+ vld1.8 {q10}, [r1, :128], r3
+ vld1.8 {q11}, [r1, :128], r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.8 { q8}, [r0, :128], r2
+ vst1.8 { q9}, [r0, :128], r2
+ vst1.8 {q10}, [r0, :128], r2
+ vst1.8 {q11}, [r0, :128], r2
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_8_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_8_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ ldr lr, [sp, #16] @ width
+ vmov.u8 q15, #128
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #2
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #4
+ vld1.32 {d16[0]}, [r1, :32], r3
+ vld1.32 {d16[1]}, [r1, :32], r3
+ vld1.32 {d17[0]}, [r1, :32], r3
+ vld1.32 {d17[1]}, [r1, :32], r3
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.32 {d16[0]}, [r0, :32], r2
+ vst1.32 {d16[1]}, [r0, :32], r2
+ vst1.32 {d17[0]}, [r0, :32], r2
+ vst1.32 {d17[1]}, [r0, :32], r2
+ bpl 1b
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_c_32_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_sao_band_c_32_neon_8, export=1
+ push {r4, lr}
+ bl band_load_c
+
+ vmov.i8 q15, #128
+ sub r3, #32
+ sub r2, #32
+
+1: subs r12, #1
+ vld2.8 { q8, q9 }, [r1, :128]!
+ vld2.8 {q10, q11}, [r1, :128], r3
+
+ pld [r4]
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vst2.8 { q8, q9 }, [r0, :128]!
+ vst2.8 {q10, q11}, [r0, :128], r2
+
+ itt ne
+ addne r4, r3
+ addne r4, #32
+
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_c_16_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_sao_band_c_16_neon_8, export=1
+ push {r4, lr}
+ bl band_load_c
+ vmov.i8 q15, #128
+
+1: subs r12, #2
+ vld2.8 { q8, q9 }, [r1, :128], r3
+ vld2.8 {q10, q11}, [r1, :128], r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vst2.8 { q8, q9 }, [r0, :128], r2
+ vst2.8 {q10, q11}, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_c_8_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_sao_band_c_8_neon_8, export=1
+ push {r4, lr}
+ bl band_load_c
+ ldr lr, [sp, #24] @ width
+ vmov.u8 q15, #128
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #1
+ vld2.8 {d16, d17}, [r1, :128], r3
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vst2.8 {d16, d17}, [r0, :128], r2
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #1
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+ vuzp.8 d16, d17
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vzip.8 d16, d17
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ bpl 1b
+ pop {r4, pc}
+endfunc
+
+
+@ ff_hevc_sao_band_64_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_64_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q2, #0
+ vdup.i16 q3, lr
+ bl band_load_y
+ vpush {q4-q7}
+
+1: subs r12, #1
+ vldm r1, {q4-q11}
+ add r1, r3
+ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth
+ vstm r0, {q4-q11}
+ add r0, r2
+ bpl 1b
+
+ vpop {q4-q7}
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_64_neon_10, export=1
+ band_64_16 10
+endfunc
+
+@ ff_hevc_sao_band_32_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_32_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q2, #0
+ vdup.i16 q3, lr
+ bl band_load_y
+
+1: subs r12, #1
+ vldm r1, {q8-q11}
+ add r1, r3
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth
+ vstm r0, {q8-q11}
+ add r0, r2
+ bpl 1b
+
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_32_neon_10, export=1
+ band_32_16 10
+endfunc
+
+@ ff_hevc_sao_band_16_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_16_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ bl band_load_y
+
+1: subs r12, #2
+ vld1.16 { q8, q9 }, [r1, :128], r3
+ vld1.16 {q10, q11}, [r1, :128], r3
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth
+ vst1.16 { q8, q9 }, [r0, :128], r2
+ vst1.16 {q10, q11}, [r0, :128], r2
+ bpl 1b
+
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_16_neon_10, export=1
+ band_16_16 10
+endfunc
+
+@ ff_hevc_sao_band_8_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_8_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ bl band_load_y
+ ldr lr, [sp, #16]
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #2
+ vld1.16 { q8}, [r1, :128], r3
+ vld1.16 { q9}, [r1, :128], r3
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth
+ vst1.16 { q8}, [r0, :128], r2
+ vst1.16 { q9}, [r0, :128], r2
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #4
+ vld1.16 {d16}, [r1, :64], r3
+ vld1.16 {d17}, [r1, :64], r3
+ vld1.16 {d18}, [r1, :64], r3
+ vld1.16 {d19}, [r1, :64], r3
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth
+ vst1.16 {d16}, [r0, :64], r2
+ vst1.16 {d17}, [r0, :64], r2
+ vst1.16 {d18}, [r0, :64], r2
+ vst1.16 {d19}, [r0, :64], r2
+ bpl 1b
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_8_neon_10, export=1
+ band_8_16 10
+endfunc
+
+
+@ ff_hevc_sao_band_c_32_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_32_16 bit_depth
+ push {r4, lr}
+ bl band_load_c
+ vpush {q4-q7}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ sub r2, #96
+
+1: subs r12, #1
+
+ vld2.16 { q4, q5 }, [r1, :128]!
+ vld2.16 { q6, q7 }, [r1, :128]!
+ vld2.16 { q8, q9 }, [r1, :128]!
+ vld2.16 {q10, q11}, [r1, :128], r3
+
+ pld [r4]
+ sub r1, #96
+
+ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ it ne
+ addne r4, r3
+
+ vst2.16 { q4, q5 }, [r0, :128]!
+ vst2.16 { q6, q7 }, [r0, :128]!
+ vst2.16 { q8, q9 }, [r0, :128]!
+ vst2.16 {q10, q11}, [r0, :128], r2
+
+ bpl 1b
+
+ vpop {q4-q7}
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_c_32_neon_10, export=1
+ band_c_32_16 10
+endfunc
+
+
+@ ff_hevc_sao_band_c_16_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_16_16 bit_depth
+ push {r4, lr}
+ bl band_load_c
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ sub r2, #32
+ sub r3, #32
+
+1: subs r12, #1
+
+ vld2.16 { q8, q9 }, [r1, :128]!
+ vld2.16 {q10, q11}, [r1, :128], r3
+
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ vst2.16 { q8, q9 }, [r0, :128]!
+ vst2.16 {q10, q11}, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_c_16_neon_10, export=1
+ band_c_16_16 10
+endfunc
+
+
+@ ff_hevc_sao_band_c_8_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_8_16 bit_depth
+ push {r4, lr}
+ bl band_load_c
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ ldr lr, [sp, #24] @ width
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #1
+ vld2.16 { q8, q9 }, [r1, :128], r3
+
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ vst2.16 { q8, q9 }, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #2
+ vld2.16 {d16, d17}, [r1, :128], r3
+ vld2.16 {d18, d19}, [r1, :128], r3
+
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ vst2.16 {d16, d17}, [r0, :128], r2
+ vst2.16 {d18, d19}, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_c_8_neon_10, export=1
+ band_c_8_16 10
+endfunc
+
+
+@ =============================================================================
+@ SAO EDGE
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27. For Y d26=d27
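+@
+@ Rough scalar model of the computation (for reference only):
+@   idx    = 2 + sign(c - a) + sign(c - b);        @ 0..4
+@   dst[x] = clip(c + xlat[idx]);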
+
+function edge_64b_body_8
+
+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u8 q13, q5, q1
+ vcgt.u8 q14, q6, q2
+ vcgt.u8 q15, q7, q3
+
+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
+ vcgt.u8 q1, q5
+ vcgt.u8 q2, q6
+ vcgt.u8 q3, q7
+
+ vsub.s8 q0, q12 @ a = sign(c-a)
+ vsub.s8 q1, q13
+ vsub.s8 q2, q14
+ vsub.s8 q3, q15
+
+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
+ vcgt.u8 q13, q5, q9
+ vcgt.u8 q14, q6, q10
+ vcgt.u8 q15, q7, q11
+
+ vsub.s8 q0, q12
+ vsub.s8 q1, q13
+ vsub.s8 q2, q14
+ vsub.s8 q3, q15
+
+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
+ vcgt.u8 q13, q9, q5
+ vcgt.u8 q14, q10, q6
+ vcgt.u8 q15, q11, q7
+
+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
+ vadd.s8 q1, q13
+ vmov.u8 q12, #2
+ vadd.s8 q2, q14
+ vadd.s8 q3, q15
+
+ vadd.s8 q0, q12
+ vadd.s8 q1, q12
+
+ vld1.8 {d26, d27}, [r5]
+
+ vadd.s8 q2, q12
+ vuzp.8 q0, q1
+ vmov.u8 q15, #128
+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
+
+ vtbl.8 d0, {d26}, d0
+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d1, {d26}, d1
+ vadd.s8 q14, q5, q15
+
+ vtbl.8 d2, {d27}, d2
+ vuzp.8 q2, q3
+
+ vtbl.8 d3, {d27}, d3
+
+ vtbl.8 d4, {d26}, d4
+ vzip.8 q0, q1
+
+ vtbl.8 d5, {d26}, d5
+ vqadd.s8 q0, q12
+ vqadd.s8 q1, q14
+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d6, {d27}, d6
+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d7, {d27}, d7
+ vzip.8 q2, q3
+
+ vsub.s8 q0, q15
+ vqadd.s8 q2, q12
+ vqadd.s8 q3, q14
+ vsub.s8 q1, q15
+ vsub.s8 q2, q15
+ vsub.s8 q3, q15
+
+ bx lr
+endfunc
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ r4 upper clip value
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27. For Y d26=d27
+
+function edge_64b_body_16
+
+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0
+ vcgt.u16 q13, q5, q1
+ vcgt.u16 q14, q6, q2
+ vcgt.u16 q15, q7, q3
+
+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0
+ vcgt.u16 q1, q1, q5
+ vcgt.u16 q2, q2, q6
+ vcgt.u16 q3, q3, q7
+
+ vsub.s16 q0, q0, q12 // a = sign(c-a)
+ vsub.s16 q1, q1, q13
+ vsub.s16 q2, q2, q14
+ vsub.s16 q3, q3, q15
+
+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0
+ vcgt.u16 q13, q5, q9
+ vcgt.u16 q14, q6, q10
+ vcgt.u16 q15, q7, q11
+
+ vsub.s16 q0, q0, q12
+ vsub.s16 q1, q1, q13
+ vsub.s16 q2, q2, q14
+ vsub.s16 q3, q3, q15
+
+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0
+ vcgt.u16 q13, q9, q5
+ vcgt.u16 q14, q10, q6
+ vcgt.u16 q15, q11, q7
+
+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b)
+ vadd.s16 q1, q1, q13
+ vmov.u8 q12, #2
+ vadd.s16 q2, q2, q14
+ vadd.s16 q3, q3, q15
+
+ vmovn.s16 d0, q0
+ vmovn.s16 d1, q1
+ vmovn.s16 d2, q2
+ vmovn.s16 d3, q3
+
+ vuzp.8 q0, q1
+
+ vld1.8 {d26, d27}, [r5]
+
+ vadd.s8 q0, q0, q12
+ vadd.s8 q1, q1, q12
+
+ vtbl.8 d0, {d26}, d0
+ vtbl.8 d1, {d26}, d1
+ vtbl.8 d2, {d27}, d2
+ vtbl.8 d3, {d27}, d3
+
+ vmov.i64 q12, #0
+
+ vzip.8 q0, q1
+
+ vdup.i16 q13, r4
+
+ @ Avoid overwrite whilst widening
+ vaddw.s8 q2, q6, d2
+ vaddw.s8 q3, q7, d3
+ vaddw.s8 q1, q5, d1
+ vaddw.s8 q0, q4, d0
+
+ @ now clip
+ clip16_4 q2, q3, q1, q0, q12, q13
+
+ bx lr
+endfunc
+
+
+@ a <- c <- b
+@ a in q0
+@ c in q1
+@ b in q2
+@ Temp q3, q9, q10
+@
+@ d16, d17 (q8) xlat U, V
+@ q14.u8 #2
+@ q15.u8 #128
+
+function edge_16b_body_8
+ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0
+ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0
+ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0
+
+ vsub.s8 q0, q3
+ vsub.s8 q10, q9
+ vadd.s8 q0, q10 @ a = sign(c-a) + sign(c-b)
+
+ vadd.s8 q0, q14
+ vuzp.8 d0, d1
+ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d0, {d16}, d0
+ vtbl.8 d1, {d17}, d1
+
+ vzip.8 d0, d1
+ vqadd.s8 q0, q3
+ vsub.s8 q0, q15
+
+ bx lr
+endfunc
+
+@ a <- c <- b
+@ a in q0
+@ c in q1
+@ b in q2
+@ Temp q3
+@
+@ q12, #0
+@ d16, d17 xlat U, V
+@ q14.u8 #2
+@ q15.u16 max
+function edge_16b_body_16
+ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0
+ vsub.s16 q0, q3 @ a = sign(c-a)
+ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0
+ vsub.s16 q0, q3
+ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0
+ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b)
+
+ vmovn.s16 d0, q0
+ @ d1 will have random contents that we transform but
+ @ that doesn't matter as we then discard them
+ vuzp.8 d0, d1
+
+ vadd.s8 q0, q0, q14
+
+ vtbl.8 d0, {d16}, d0
+ vtbl.8 d1, {d17}, d1
+
+ vzip.8 d0, d1
+
+ vaddw.s8 q0, q1, d0
+
+ @ now clip
+ vmax.s16 q0, q12
+ vmin.s16 q0, q15
+ bx lr
+endfunc
+
+
+@ ff_hevc_sao_edge_[c_]xx_neon(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only
+@ int eo, [sp, #sp_base + 0]
+@ int width, [sp, #sp_base + 4]
+@ int height) [sp, #sp_base + 8]
+
+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0
+ push {r4-r6, lr} @ 16 bytes
+.set sp_base, 16
+
+@ Build translate registers
+@ As translate values can only be 0-4 we don't care about junk in the rest
+@ of the register
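+@ The interleaved load order below builds xlat[idx] = sao_offset_val[edge_idx[idx]]
+@ with edge_idx = {1, 2, 0, 3, 4} (the usual HEVC edge-index reordering); only the
+@ low byte of each int16_t offset is loaded, which is fine while the offsets fit
+@ in a signed byte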
+ mov r12, #2
+.if \is_chroma
+ ldr r4, [sp, #16]
+.set sp_base, sp_base + 4
+.endif
+ vld1.8 {d16[2]}, [r3], r12
+ vld1.8 {d16[0]}, [r3], r12
+ vld1.8 {d16[1]}, [r3], r12
+ vld1.8 {d16[3]}, [r3], r12
+ vld1.8 {d16[4]}, [r3]
+.if \is_chroma
+ vld1.8 {d17[2]}, [r4], r12
+ vld1.8 {d17[0]}, [r4], r12
+ vld1.8 {d17[1]}, [r4], r12
+ vld1.8 {d17[3]}, [r4], r12
+ vld1.8 {d17[4]}, [r4]
+.else
+ vmov d17, d16
+.endif
+
+@ Setup constant registers
+.if \bit_depth > 8
+ movw r4, (1 << \bit_depth) - 1
+.endif
+.if \setup_16b
+.if \bit_depth > 8
+ vmov.i64 q12, #0
+ vdup.16 q15, r4
+.else
+ vmov.u8 q15, #128
+.endif
+ vmov.u8 q14, #2
+.endif
+ movw r3, EDGE_SRC_STRIDE
+
+@ If setup_64b we need the xlat table on the stack and q4-q7 saved
+.if \setup_64b
+ sub r5, sp, #16
+ vpush {q4-q8} @ 80 bytes, q8 pushed first
+.set sp_base, sp_base + 80
+.endif
+
+@ Get jump address
+@ We have a special case for width 4 as the calling code doesn't detect it
+@ If we may have w4 then we add a 2nd jump table after the 1st
+.if \check_w4
+ ldr r12, [sp, #sp_base + 4] @ width
+ cmp r12, #8
+.endif
+ ldr r12, [sp, #sp_base + 0] @ eo
+ adr r6, \jump_tab
+.if \check_w4
+ it lt
+ addlt r6, #16
+.endif
+ ldr r6, [r6, r12, lsl #2]
+
+ ldr r12, [sp, #sp_base + 8] @ height
+
+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
+.if \do2
+ push {r0, r1, r6, r12}
+ blx r6
+ pop {r0, r1, r6, r12}
+
+ add r0, #64
+ add r1, #64
+.endif
+
+ blx r6
+
+@ Tidy up & return
+.if \setup_64b
+ vpop {q4-q8} @ spurious but harmless load of q8
+.endif
+ pop {r4-r6, pc}
+.endm
+
+
+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
+.endm
+
+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab
+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1
+.endm
+
+
+.macro edge_64b_e0, body_fn, pb
+ mov r6, lr
+ sub r1, #8
+1: vldm r1, {d7-d16}
+ subs r12, #1
+ add r1, r3
+ // load a
+ vext.8 q0, q3, q4, #(16 - \pb)
+ vext.8 q1, q4, q5, #(16 - \pb)
+ vext.8 q2, q5, q6, #(16 - \pb)
+ vext.8 q3, q6, q7, #(16 - \pb)
+ // load b
+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite
+ vext.8 q8, q4, q5, #\pb
+ vext.8 q9, q5, q6, #\pb
+ vext.8 q10, q6, q7, #\pb
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_32bx2_e0, body_fn, pb
+ mov r6, lr
+
+1: subs r12, #2
+
+ vld1.8 {q4-q5}, [r1]
+ sub r1, #\pb
+ vld1.8 {q0-q1}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {q8-q9}, [r1], r3
+ sub r1, #\pb
+ vld1.8 {q6-q7}, [r1]
+ sub r1, #\pb
+ vld1.8 {q2-q3}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {q10-q11}, [r1], r3
+ sub r1, #\pb
+
+ bl \body_fn
+
+ vst1.8 {q0,q1}, [r0], r2
+ vst1.8 {q2,q3}, [r0], r2
+
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_16b_e0, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+ sub r3, #\pb * 2
+
+1: subs r12, #1
+
+ vld1.64 {q0}, [r1] @ load a
+ add r1, #\pb
+ vld1.64 {q1}, [r1, :128] @ load c
+ add r1, #\pb
+ vld1.64 {q2}, [r1], r3 @ load b
+
+ bl \body_fn
+ vst1.8 {q0}, [r0], r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e0, body_fn, pb
+ mov r6, lr
+
+1: subs r12, #2
+
+ vld1.8 {d2}, [r1, :64]
+ sub r1, #\pb
+ vld1.8 {d0}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {d4}, [r1], r3
+ sub r1, #\pb
+ vld1.8 {d3}, [r1, :64]
+ sub r1, #\pb
+ vld1.8 {d1}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {d5}, [r1], r3
+ sub r1, #\pb
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_4bx4_e0, body_fn, pb
+ mov r6, lr
+
+1: subs r12, #4
+
+ vld1.32 {d2[0]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d0[0]}, [r1]
+ add r1, #(\pb * 2)
+ vld1.32 {d4[0]}, [r1], r3 @ R
+ vld1.32 {d4[1]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d2[1]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d0[1]}, [r1], r3 @ L
+ vld1.32 {d1[0]}, [r1]
+ add r1, #\pb
+ vld1.32 {d3[0]}, [r1]
+ add r1, #\pb
+ vld1.32 {d5[0]}, [r1], r3 @ R
+ vld1.32 {d5[1]}, [r1]
+ sub r1, #(\pb * 2)
+ vld1.32 {d1[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d3[1]}, [r1], r3 @ M
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+
+ bgt 1b
+ bx r6
+.endm
+
+
+.macro edge_64b_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128]!
+ vld1.8 {q2-q3}, [r1, :128], r3
+ sub r1, #32
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128]!
+ vld1.8 {q10-q11}, [r1, :128], r3
+ sub r1, #32
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ // copy c to a
+ vmov.64 q0, q4
+ vmov.64 q1, q5
+ vmov.64 q2, q6
+ vmov.64 q3, q7
+ // copy b to c
+ vmov.64 q4, q8
+ vmov.64 q5, q9
+ vmov.64 q6, q10
+ vmov.64 q7, q11
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_32bx2_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128], r3
+ vld1.8 {q4-q5}, [r1, :128], r3
+
+1: subs r12, #2
+ @ Given the data duplication here we could obviously do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vmov q2, q4
+ vmov q3, q5
+ vld1.8 {q8-q9}, [r1, :128], r3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vmov q6, q8
+ vmov q7, q9
+
+ bl \body_fn
+
+ vst1.8 {q0,q1}, [r0], r2
+ vst1.8 {q2,q3}, [r0], r2
+
+ // copy c to a
+ vmov.64 q0, q8
+ vmov.64 q1, q9
+
+ // copy b to c
+ vmov.64 q4, q10
+ vmov.64 q5, q11
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_16b_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {q0}, [r1, :128], r3
+ // load c
+ vld1.8 {q1}, [r1, :128], r3
+1: subs r12, #1
+ // load b
+ vld1.8 {q2}, [r1, :128], r3
+ bl \body_fn
+ vst1.8 {q0}, [r0], r2
+ // copy c to a
+ vmov.64 q0, q1
+ // copy b to c
+ vmov.64 q1, q2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {d0}, [r1, :64], r3
+ vld1.8 {d2}, [r1, :64], r3
+
+1: subs r12, #2
+ @ Given the data duplication here we could obviously do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vmov.64 d1, d2
+ vld1.8 {d4}, [r1, :64], r3
+ vld1.8 {d5}, [r1, :64], r3
+ vmov.64 d3, d4
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r0], r2
+
+ // copy c to a
+ vmov.64 d0, d4
+ // copy b to c
+ vmov.64 d2, d5
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_4bx4_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.32 {d0[0]}, [r1], r3
+ vld1.32 {d0[1]}, [r1], r3
+
+1: subs r12, #4
+ @ Given the data duplication here we could probably do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vld1.32 {d4[0]}, [r1], r3
+ vld1.32 {d4[1]}, [r1], r3
+ vld1.32 {d5[0]}, [r1], r3
+ vld1.32 {d5[1]}, [r1], r3
+
+ vmov.32 d1, d4
+ vext.32 d2, d0, d4, #1
+ vext.32 d3, d4, d5, #1
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+
+ vmov.32 d0, d5
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_64b_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #32
+ sub r3, #(32 - \pb)
+
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ // load b
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #(64 + \pb)
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ bgt 1b
+
+ add r3, #(32 - \pb)
+ bx r6
+.endm
+
+.macro edge_32bx2_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+
+1: sub r1, r3
+ vld1.8 {q0-q1}, [r1], r3
+ vld1.8 {q2-q3}, [r1]
+ subs r12, #2
+ // load c
+ add r1, #\pb
+ vld1.8 {q4-q5}, [r1, :128], r3
+ vld1.8 {q6-q7}, [r1, :128]
+ // load b
+ add r1, #\pb
+ vld1.8 {q8-q9}, [r1], r3
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #(\pb * 2)
+
+ bl \body_fn
+
+ vst1.8 {q0-q1}, [r0], r2
+ vst1.8 {q2-q3}, [r0], r2
+ bgt 1b
+
+ bx r6
+.endm
+
+.macro edge_16b_e2, body_fn, pb
+ mov r6, lr
+ add r3, #\pb
+
+1: sub r1, r3
+ // load a
+ vld1.8 {q0}, [r1], r3
+ subs r12, #1
+ // load c
+ vld1.8 {q1}, [r1, :128], r3
+ // load b
+ vld1.8 {q2}, [r1]
+ sub r1, #\pb
+ bl \body_fn
+ vst1.8 {q0}, [r0], r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+
+1: sub r1, r3
+ vld1.8 {d0}, [r1], r3
+ vld1.8 {d1}, [r1]
+ subs r12, #2
+ // load c
+ add r1, #\pb
+ vld1.8 {d2}, [r1, :64], r3
+ vld1.8 {d3}, [r1, :64]
+ // load b
+ add r1, #\pb
+ vld1.8 {d4}, [r1], r3
+ vld1.8 {d5}, [r1]
+ sub r1, #(\pb * 2)
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r0], r2
+ bgt 1b
+
+ bx r6
+.endm
+
+.macro edge_4bx4_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+
+1: sub r1, r3
+ @ line 0 {d0[0], -, - } r1 lo
+ vld1.32 {d0[0]}, [r1], r3
+ subs r12, #4
+ @ Line 1 {d0[1], d2[0], - } r1 lo
+ vld1.32 {d0[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d2[0]}, [r1], r3
+ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid
+ vld1.32 {d2[1]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d1[0]}, [r1]
+ add r1, #\pb * 2
+ vld1.32 {d4[0]}, [r1], r3
+ @ Line 3 {d1[1], d3[0], d4[1]} r1 hi
+ vld1.32 {d4[1]}, [r1]
+ sub r1, #\pb * 2
+ vld1.32 {d1[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d3[0]}, [r1], r3
+ @ Line 4 {-, d3[1], d5[0]} r1 mid
+ vld1.32 {d3[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d5[0]}, [r1], r3
+ @ Line 5 {-, -, d5[1]} r1 hi
+ vld1.32 {d5[1]}, [r1]
+ sub r1, #(\pb * 2)
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+ bgt 1b
+
+ bx r6
+.endm
+
+.macro edge_64b_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_64b_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_32bx2_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_32bx2_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_16b_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_16b_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_8bx2_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_8bx2_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_4bx4_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_4bx4_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_64b_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+
+0: edge_64b_e0 \body_fn, \pb
+10: edge_64b_e1 \body_fn
+20: edge_64b_e2 \body_fn, \pb
+30: edge_64b_e3 \body_fn, \pb
+.endm
+
+.macro edge_32bx2_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+
+0: edge_32bx2_e0 \body_fn, \pb
+10: edge_32bx2_e1 \body_fn
+20: edge_32bx2_e2 \body_fn, \pb
+30: edge_32bx2_e3 \body_fn, \pb
+.endm
+
+.macro edge_16b_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+
+0: edge_16b_e0 \body_fn, \pb
+10: edge_16b_e1 \body_fn
+20: edge_16b_e2 \body_fn, \pb
+30: edge_16b_e3 \body_fn, \pb
+.endm
+
+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+ .word 5f
+ .word 15f
+ .word 25f
+ .word 35f
+
+0: edge_32bx2_e0 \body_fn_64b, \pb
+10: edge_32bx2_e1 \body_fn_64b
+20: edge_32bx2_e2 \body_fn_64b, \pb
+30: edge_32bx2_e3 \body_fn_64b, \pb
+5: edge_16b_e0 \body_fn_16b, \pb
+15: edge_16b_e1 \body_fn_16b
+25: edge_16b_e2 \body_fn_16b, \pb
+35: edge_16b_e3 \body_fn_16b, \pb
+.endm
+
+.macro edge_16b_8bx2_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+ .word 5f
+ .word 15f
+ .word 25f
+ .word 35f
+
+0: edge_16b_e0 \body_fn, \pb
+10: edge_16b_e1 \body_fn
+20: edge_16b_e2 \body_fn, \pb
+30: edge_16b_e3 \body_fn, \pb
+5: edge_8bx2_e0 \body_fn, \pb
+15: edge_8bx2_e1 \body_fn
+25: edge_8bx2_e2 \body_fn, \pb
+35: edge_8bx2_e3 \body_fn, \pb
+.endm
+
+.macro edge_8bx2_4bx4_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+ .word 5f
+ .word 15f
+ .word 25f
+ .word 35f
+
+0: edge_8bx2_e0 \body_fn, \pb
+10: edge_8bx2_e1 \body_fn
+20: edge_8bx2_e2 \body_fn, \pb
+30: edge_8bx2_e3 \body_fn, \pb
+5: edge_4bx4_e0 \body_fn, \pb
+15: edge_4bx4_e1 \body_fn
+25: edge_4bx4_e2 \body_fn, \pb
+35: edge_4bx4_e3 \body_fn, \pb
+.endm
+
+@ void ff_hevc_sao_edge_8_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_8_neon_8, export=1
+ edge_16b_init 8, 0, 1, 99f
+99:
+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1
+endfunc
+
+@ void ff_hevc_sao_edge_16_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_16_neon_8, export=1
+ edge_16b_init 8, 0, 0, 99f
+99:
+ edge_16b_bodies edge_16b_body_8, 1
+endfunc
+
+@ void ff_hevc_sao_edge_32_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_32_neon_8, export=1
+ edge_64b_init 8, 0, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_8, 1
+endfunc
+
+@ void ff_hevc_sao_edge_64_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_64_neon_8, export=1
+ edge_64b_init 8, 0, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_8, 1
+endfunc
+
+@ ff_hevc_sao_edge_c_8_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_8_neon_8, export=1
+ edge_16b_init 8, 1, 1, 99f
+99:
+ edge_16b_8bx2_bodies edge_16b_body_8, 2
+endfunc
+
+@ ff_hevc_sao_edge_c_16_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_16_neon_8, export=1
+ edge_64b_init 8, 1, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_8, 2
+endfunc
+
+@ ff_hevc_sao_edge_c_32_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_32_neon_8, export=1
+ edge_64b_init 8, 1, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_8, 2
+endfunc
+
+@ void ff_hevc_sao_edge_8_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_8_neon_10, export=1
+ edge_16b_init 10, 0, 1, 99f
+99:
+ edge_16b_8bx2_bodies edge_16b_body_16, 2
+endfunc
+
+@ void ff_hevc_sao_edge_16_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_16_neon_10, export=1
+ edge_64b_init 10, 0, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_16, 2
+endfunc
+
+@ void ff_hevc_sao_edge_64_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+@ We simply split the 64 case into 2 vertical stripes
+@ and call the fns for w32
+@
+@ Calling code will always have src != dst so we don't have to worry
+@ about edge effects
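+@
+@ (do2=1 in edge_64b_init makes it run the selected body twice, advancing
+@ dst and src by 64 bytes, i.e. 32 samples at 10 bit, between the passes)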
+
+function ff_hevc_sao_edge_64_neon_10, export=1
+ edge_64b_init 10, 0, 1, 99f
+endfunc
+
+@ void ff_hevc_sao_edge_32_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_32_neon_10, export=1
+ edge_64b_init 10, 0, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_16, 2
+endfunc
+
+@ ff_hevc_sao_edge_c_8_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_8_neon_10, export=1
+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
+99:
+ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
+endfunc
+
+@ ff_hevc_sao_edge_c_32_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_32_neon_10, export=1
+ edge_64b_init 10, 1, 1, 99f
+endfunc
+
+
+@ ff_hevc_sao_edge_c_16_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_16_neon_10, export=1
+ edge_64b_init 10, 1, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_16, 4
+endfunc
+
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 1be52e7a12..bae5df4bc6 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,8 @@ enum AVCodecID {
AV_CODEC_ID_SHEERVIDEO,
AV_CODEC_ID_YLC,
+ AV_CODEC_ID_H264_MVC,
+
/* various PCM "codecs" */
AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext {
#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_CAVLC_444 44
+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118
+#define FF_PROFILE_H264_STEREO_HIGH 128
+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
#define FF_PROFILE_VC1_SIMPLE 0
#define FF_PROFILE_VC1_MAIN 1
@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext {
#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
#endif
+ /**
+ * Opaque pointer for use by replacement get_buffer2 code
+ *
+ * @author jc (08/02/2016)
+ */
+ void * get_buffer_context;
+
} AVCodecContext;
AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 1bf1c620d6..ccfa991f60 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
typedef struct CABACContext{
int low;
int range;
- int outstanding_count;
+ union
+ {
+ int outstanding_count;
+ struct {
+ uint16_t bits;
+ uint16_t range;
+ } by22;
+ };
const uint8_t *bytestream_start;
const uint8_t *bytestream;
const uint8_t *bytestream_end;
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 9d94b72..535ebf0 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
+ {
+ .id = AV_CODEC_ID_H264_MVC,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "h264_mvc",
+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
/* various PCM "codecs" */
{
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index efe3555..16358aa 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -126,7 +126,9 @@ enum {
NAL_END_STREAM = 11,
NAL_FILLER_DATA = 12,
NAL_SPS_EXT = 13,
+ NAL_SPS_SUBSET = 15,
NAL_AUXILIARY_SLICE = 19,
+ NAL_SLICE_EXT = 20,
NAL_FF_IGNORE = 0xff0f001,
};
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index ce4bab2..b9b0c78 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
-    uint8_t parse_history[6];
+    uint8_t parse_history[9];
int parse_history_count;
int parse_last_mb;
+ int is_mvc;
+ int slice_ext;
} H264ParseContext;
@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
} else if (state <= 5) {
int nalu_type = buf[i] & 0x1F;
if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
- nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+ nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+ nalu_type == NAL_SPS_SUBSET) {
if (pc->frame_start_found) {
i++;
goto found;
}
} else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
- nalu_type == NAL_IDR_SLICE) {
+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
state += 8;
+
+ p->slice_ext = (nalu_type == NAL_SLICE_EXT);
continue;
}
state = 7;
} else {
p->parse_history[p->parse_history_count++] = buf[i];
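+            // NAL_SLICE_EXT (type 20) carries a 3-byte MVC extension header before
+            // the slice header, so collect 3 extra history bytes and skip them
+            // before reading first_mb_in_slice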
- if (p->parse_history_count > 5) {
+ if (p->parse_history_count > 8) {
unsigned int mb, last_mb = p->parse_last_mb;
GetBitContext gb;
- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
p->parse_history_count = 0;
mb= get_ue_golomb_long(&gb);
p->parse_last_mb = mb;
@@ -145,7 +150,7 @@ found:
pc->frame_start_found = 0;
if (p->is_avc)
return next_avc;
- return i - (state & 5) - 5 * (state > 7);
+ return i - (state & 5) - 8 * (state > 7);
}
static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
}
}
- parse_nal_units(s, avctx, buf, buf_size);
+ if (!p->is_mvc)
+ parse_nal_units(s, avctx, buf, buf_size);
if (avctx->framerate.num)
avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
if ((state & 0xFFFFFF00) != 0x100)
break;
nalu_type = state & 0x1F;
- if (nalu_type == NAL_SPS) {
+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
has_sps = 1;
} else if (nalu_type == NAL_PPS)
has_pps = 1;
@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
.parser_close = h264_close,
.split = h264_split,
};
+
+static av_cold int init_mvc(AVCodecParserContext *s)
+{
+ H264ParseContext *p = s->priv_data;
+ int ret = init(s);
+ if (ret < 0)
+ return ret;
+
+ p->is_mvc = 1;
+ return 0;
+}
+
+AVCodecParser ff_h264_mvc_parser = {
+ .codec_ids = { AV_CODEC_ID_H264_MVC },
+ .priv_data_size = sizeof(H264ParseContext),
+ .parser_init = init_mvc,
+ .parser_parse = h264_parse,
+ .parser_close = h264_close,
+ .split = h264_split,
+};
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index c1fa67f67b..6f99021339 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -41,8 +41,346 @@
#include "hevc.h"
#include "profiles.h"
+#ifdef RPI
+ #include "rpi_qpu.h"
+ #include "rpi_shader.h"
+ #include "rpi_shader_cmd.h"
+ #include "rpi_shader_template.h"
+ #include "rpi_zc.h"
+ #include "libavutil/rpi_sand_fns.h"
+
+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+ #define RPI_CACHE_UNIF_MVS 1
+
+ #include "pthread.h"
+ #include "libavutil/atomic.h"
+
+ static void worker_core(HEVCContext * const s);
+#endif
+
+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards
+
+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+
+#ifndef av_mod_uintp2
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+ return a & ((1 << p) - 1);
+}
+# define av_mod_uintp2 av_mod_uintp2_c
+#endif
+
const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+
+#if RPI_INTER
+
+#define MC_DUMMY_X (-32)
+#define MC_DUMMY_Y (-32)
+
+// UV still has min 4x4 pred
+// Allow for even spread +1 for setup, +1 for rounding
+// As we have load sharing this can (in theory) be exceeded so we have to
+// check after each CTU, but it is a good base size
+
+// Worst case (all 4x4) commands per CTU
+#define QPU_Y_CMD_PER_CTU_MAX (8 * 8)
+#define QPU_C_CMD_PER_CTU_MAX (4 * 4)
+
+#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX)
+#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX)
+
+// The QPU code for UV blocks only works up to a block width of 8
+#define RPI_CHROMA_BLOCK_WIDTH 8
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+
+
+// Actual filter goes -ve, +ve, +ve, -ve using these values
+static const uint32_t rpi_filter_coefs[8] = {
+ ENCODE_COEFFS( 0, 64, 0, 0),
+ ENCODE_COEFFS( 2, 58, 10, 2),
+ ENCODE_COEFFS( 4, 54, 16, 2),
+ ENCODE_COEFFS( 6, 46, 28, 4),
+ ENCODE_COEFFS( 4, 36, 36, 4),
+ ENCODE_COEFFS( 4, 28, 46, 6),
+ ENCODE_COEFFS( 2, 16, 54, 4),
+ ENCODE_COEFFS( 2, 10, 58, 2)
+};
+
+// Function arrays by QPU
+
+static const int * const inter_pred_setup_c_qpu[12] = {
+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
+};
+
+static const int * const inter_pred_setup_c10_qpu[12] = {
+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
+};
+
+static const int * const inter_pred_setup_y_qpu[12] = {
+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
+};
+
+static const int * const inter_pred_setup_y10_qpu[12] = {
+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
+};
+
+static const int * const inter_pred_sync_qpu[12] = {
+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
+};
+
+static const int * const inter_pred_sync10_qpu[12] = {
+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
+};
+
+static const int * const inter_pred_exit_c_qpu[12] = {
+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
+};
+
+static const int * const inter_pred_exit_c10_qpu[12] = {
+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
+};
+
+static const int * const inter_pred_exit_y_qpu[12] = {
+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
+};
+
+static const int * const inter_pred_exit_y10_qpu[12] = {
+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
+};
+
+typedef struct ipe_chan_info_s
+{
+ const unsigned int n;
+ const int * const * setup_fns;
+ const int * const * sync_fns;
+ const int * const * exit_fns;
+} ipe_chan_info_t;
+
+typedef struct ipe_init_info_s
+{
+ ipe_chan_info_t luma;
+ ipe_chan_info_t chroma;
+} ipe_init_info_t;
+
+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
+ { // 8
+ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
+ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
+ },
+ { // 9
+ .luma = {0},
+ .chroma = {0}
+ },
+ { // 10
+ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
+ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
+ }
+
+};
+
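+// Split the inter-pred GPU buffer evenly between the channel's n QPU command
+// queues and record each queue's setup/sync/exit program entry points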
+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
+{
+ const unsigned int n = ici->n;
+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
+
+ ipe->n = n;
+ ipe->max_fill = q1_size - ipe->min_gap;
+ for(unsigned int i = 0; i < n; i++) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr = q->qpu_mc_base =
+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
+ q->code_setup = qpu_fn(ici->setup_fns[i]);
+ q->code_sync = qpu_fn(ici->sync_fns[i]);
+ q->code_exit = qpu_fn(ici->exit_fns[i]);
+ }
+}
+
+static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth)
+{
+ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8;
+
+ av_assert0(bit_depth >= 8 && bit_depth <= 16);
+
+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
+
+ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) {
+ HEVCRpiJob *const jb = s->jobs + i;
+ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma);
+ set_ipe_from_ici(&jb->luma_ip, &iii->luma);
+ }
+}
+
+
+#endif
+
+
+#ifdef RPI
+
+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+
+#define LOG_ENTER
+#define LOG_EXIT
+
+#define USE_SEM 1
+
+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+static void worker_submit_job(HEVCContext * const s)
+{
+ LOG_ENTER
+ sem_post(&s->jb0->sem_in);
+ s->jb0->pending = 1;
+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ s->jb0 = s->jobs + s->pass0_job;
+ LOG_EXIT
+}
+
+// Call this to say we have completed pass1
+static void worker_complete_job(HEVCContext * const s)
+{
+ LOG_ENTER
+ sem_t * const sem = &s->jb1->sem_out;
+ // Must set job no before signalling as otherwise rpi_do_all_passes
+ // may call worker_core from the main thread with a bad job number
+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ s->jb1 = s->jobs + s->pass1_job;
+ sem_post(sem);
+ LOG_EXIT
+}
+
+
+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+// available to receive the next job.
+static void worker_pass0_ready(HEVCContext *s)
+{
+ LOG_ENTER
+ HEVCRpiJob * const jb = s->jb0;
+ if (jb->pending) {
+ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR)
+ /* Loop */;
+ jb->pending = 0;
+ }
+ LOG_EXIT
+}
+
+// Call this to wait for all jobs to have completed at the end of a frame
+static void worker_wait(HEVCContext * const s)
+{
+ LOG_ENTER
+ unsigned int i;
+ for (i = 0; i != RPI_MAX_JOBS; ++i) {
+ HEVCRpiJob * const jb = s->jobs + i;
+ if (jb->pending) {
+ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR)
+ /* Loop */;
+ jb->pending = 0;
+ }
+ }
+ LOG_EXIT
+}
+
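+// Worker thread: wait for a job to be submitted, run pass 1 on it (worker_core)
+// and mark it complete; exit when the terminate flag is set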
+static void *worker_start(void *arg)
+{
+ HEVCContext * const s = (HEVCContext *)arg;
+
+ for (;;)
+ {
+ HEVCRpiJob * const jb = s->jb1;
+ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR)
+ /* Loop */;
+ if (jb->terminate)
+ break;
+
+ LOG_ENTER
+ worker_core(s);
+ worker_complete_job(s);
+ LOG_EXIT
+ }
+ return NULL;
+}
+
+static void worker_pic_free_all(HEVCContext * const s)
+{
+ unsigned int i;
+
+ // Free coeff stuff - allocation not the same for all buffers
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ {
+ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs;
+
+ if (cf->s[0].buf != NULL)
+ av_freep(&cf->mptr);
+ if (cf->s[2].buf != NULL)
+ gpu_free(&cf->gptr);
+ memset(cf, 0, sizeof(*cf));
+ }
+}
+
+static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count)
+{
+ unsigned int i;
+
+    // Alloc coeff stuff - allocation is not the same for all buffers
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ {
+ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs;
+
+// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL);
+// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL);
+// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL);
+// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL);
+
+ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
+ goto fail;
+ cf->s[2].buf = (int16_t *)cf->gptr.arm;
+ cf->s[3].buf = cf->s[2].buf + coeff_count;
+
+        // Must be 64-byte aligned for our zeroing code, so over-allocate
+        // and round up
+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
+ goto fail;
+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
+ }
+ return 0;
+
+fail:
+ printf("%s: **** Failed\n", __func__);
+ worker_pic_free_all(s);
+ return -1;
+}
+
+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
+{
+ unsigned int i;
+ for (i = 0; i != 4; ++i) {
+ cf->s[i].n = 0;
+ }
+}
+#endif
+
+
/**
* NOTE: Each function hls_foo correspond to the function foo in the
* specification (HLS stands for High Level Syntax).
@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
/* free everything allocated by pic_arrays_init() */
static void pic_arrays_free(HEVCContext *s)
{
+#ifdef RPI
+ worker_pic_free_all(s);
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+
+ if (dvq->vpu_cmds_arm) {
+ gpu_free(&dvq->deblock_vpu_gmem);
+ dvq->vpu_cmds_arm = 0;
+ }
+ }
+ }
+#endif
av_freep(&s->sao);
av_freep(&s->deblock);
@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
int ctb_count = sps->ctb_width * sps->ctb_height;
int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+#ifdef RPI
+ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
+ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+ const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+
+ av_assert0(sps);
+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+#if RPI_ROUND_TO_LINES
+ // Round down to an integral quantity of lines
+ if (s->max_ctu_count > sps->ctb_width)
+ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width;
+#endif
+
+ if (worker_pic_alloc_all(s, coefs_per_row) != 0)
+ goto fail;
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ s->enable_rpi_deblock = !sps->sao_enabled;
+ s->setup_width = (sps->width+15) / 16;
+ s->setup_height = (sps->height+15) / 16;
+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+ {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+            const unsigned int total_size = cmd_size + y_size + uv_size;
+ int p_vc;
+ uint8_t * p_arm;
+ #if RPI_VPU_DEBLOCK_CACHED
+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+ #else
+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+ #endif
+ p_vc = dvq->deblock_vpu_gmem.vc;
+ p_arm = dvq->deblock_vpu_gmem.arm;
+
+ // Zap all
+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+
+ // Subdivide
+ dvq->vpu_cmds_arm = (void*)p_arm;
+ dvq->vpu_cmds_vc = p_vc;
+
+ p_arm += cmd_size;
+ p_vc += cmd_size;
+
+ dvq->y_setup_arm = (void*)p_arm;
+ dvq->y_setup_vc = (void*)p_vc;
+
+ p_arm += y_size;
+ p_vc += y_size;
+
+ dvq->uv_setup_arm = (void*)p_arm;
+ dvq->uv_setup_vc = (void*)p_vc;
+ }
+
+ s->dvq_n = 0;
+ s->dvq = s->dvq_ents + s->dvq_n;
+ }
+#endif
+
s->bs_width = (width >> 2) + 1;
s->bs_height = (height >> 2) + 1;
@@ -137,6 +560,29 @@ fail:
return AVERROR(ENOMEM);
}
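+// Fill the slice-header weight tables with unit weights and zero offsets for
+// use when the bitstream carries no explicit pred_weight_table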
+static void default_pred_weight_table(HEVCContext * const s)
+{
+ unsigned int i;
+ s->sh.luma_log2_weight_denom = 0;
+ s->sh.chroma_log2_weight_denom = 0;
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+ s->sh.luma_weight_l0[i] = 1;
+ s->sh.luma_offset_l0[i] = 0;
+ s->sh.chroma_weight_l0[i][0] = 1;
+ s->sh.chroma_offset_l0[i][0] = 0;
+ s->sh.chroma_weight_l0[i][1] = 1;
+ s->sh.chroma_offset_l0[i][1] = 0;
+ }
+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+ s->sh.luma_weight_l1[i] = 1;
+ s->sh.luma_offset_l1[i] = 0;
+ s->sh.chroma_weight_l1[i][0] = 1;
+ s->sh.chroma_offset_l1[i][0] = 0;
+ s->sh.chroma_weight_l1[i][1] = 1;
+ s->sh.chroma_offset_l1[i][1] = 0;
+ }
+}
+
static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
{
int i = 0;
@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
{
#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
- int ret, i;
+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts;
+ int ret;
pic_arrays_free(s);
s->ps.sps = NULL;
@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
switch (sps->pix_fmt) {
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
+#if RPI_HEVC_SAND
+ // Currently geometry calc is stuffed for big sizes
+ if (sps->width < 2048 && sps->height <= 1088) {
+ *fmt++ = AV_PIX_FMT_SAND128;
+ }
+#endif
#if CONFIG_HEVC_DXVA2_HWACCEL
*fmt++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
#endif
break;
case AV_PIX_FMT_YUV420P10:
+#if RPI_HEVC_SAND
+ // Currently geometry calc is stuffed for big sizes
+ if (sps->width < 2048 && sps->height <= 1088) {
+ *fmt++ = AV_PIX_FMT_SAND64_10;
+ }
+#endif
#if CONFIG_HEVC_DXVA2_HWACCEL
*fmt++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
ret = ff_thread_get_format(s->avctx, pix_fmts);
if (ret < 0)
goto fail;
+
s->avctx->pix_fmt = ret;
}
else {
@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
ff_hevc_pred_init(&s->hpc, sps->bit_depth);
ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth);
ff_videodsp_init (&s->vdsp, sps->bit_depth);
+#ifdef RPI
+ rpi_hevc_qpu_set_fns(s, sps->bit_depth);
+#endif
- for (i = 0; i < 3; i++) {
- av_freep(&s->sao_pixel_buffer_h[i]);
- av_freep(&s->sao_pixel_buffer_v[i]);
- }
+ av_freep(&s->sao_pixel_buffer_h[0]);
+ av_freep(&s->sao_pixel_buffer_v[0]);
if (sps->sao_enabled && !s->avctx->hwaccel) {
- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
- int c_idx;
+ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+ unsigned int c_idx;
+ size_t vsize[3] = {0};
+ size_t hsize[3] = {0};
for(c_idx = 0; c_idx < c_count; c_idx++) {
int w = sps->width >> sps->hshift[c_idx];
int h = sps->height >> sps->vshift[c_idx];
- s->sao_pixel_buffer_h[c_idx] =
- av_malloc((w * 2 * sps->ctb_height) <<
- sps->pixel_shift);
- s->sao_pixel_buffer_v[c_idx] =
- av_malloc((h * 2 * sps->ctb_width) <<
- sps->pixel_shift);
+            // ctb height & width are a min of 8 so this must be a multiple of 16
+ // so no point rounding up!
+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
}
+
+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
+ // when we have plaited chroma
+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
}
s->ps.sps = sps;
@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s)
(s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
pred_weight_table(s, gb);
}
+ else
+ {
+ // Give us unit weights
+ default_pred_weight_table(s);
+ }
sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
return 0;
}
+#ifdef RPI
+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s)
+{
+ return s->jb0->intra.cmds + s->jb0->intra.n++;
+}
+
+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+{
+ // U & V done on U call in the case of sliced frames
+ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1)
+ return;
+
+ if (s->enable_rpi) {
+ HEVCLocalContext *lc = s->HEVClc;
+ HEVCPredCmd *cmd = rpi_new_intra_cmd(s);
+ cmd->type = RPI_PRED_INTRA;
+ cmd->size = log2_trafo_size;
+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+ cmd->c_idx = c_idx;
+ cmd->i_pred.x = x0;
+ cmd->i_pred.y = y0;
+ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+ }
+ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) {
+ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+ else {
+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+
+}
+#endif
+
static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int xBase, int yBase, int cb_xBase, int cb_yBase,
int log2_cb_size, int log2_trafo_size,
@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
int trafo_size = 1 << log2_trafo_size;
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+#endif
}
if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+#endif
}
} else if (blk_idx == 3) {
int trafo_size_h = 1 << (log2_trafo_size + 1);
int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, xBase, yBase,
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#endif
}
}
}
@@ -1281,47 +1827,119 @@ do {
return 0;
}
-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+
+static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
{
- HEVCLocalContext *lc = s->HEVClc;
GetBitContext gb;
- int cb_size = 1 << log2_cb_size;
- int stride0 = s->frame->linesize[0];
- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
- int stride1 = s->frame->linesize[1];
- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
- int stride2 = s->frame->linesize[2];
- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
-
- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
- s->ps.sps->pcm.bit_depth_chroma;
- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
int ret;
- if (!s->sh.disable_deblocking_filter_flag)
- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
-
ret = init_get_bits(&gb, pcm, length);
if (ret < 0)
return ret;
- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
- if (s->ps.sps->chroma_format_idc) {
- s->hevcdsp.put_pcm(dst1, stride1,
+#if RPI_HEVC_SAND
+ if (av_rpi_is_sand_frame(s->frame)) {
+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
+ s->frame->linesize[0],
+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+
+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]),
+ s->frame->linesize[1],
cb_size >> s->ps.sps->hshift[1],
cb_size >> s->ps.sps->vshift[1],
&gb, s->ps.sps->pcm.bit_depth_chroma);
- s->hevcdsp.put_pcm(dst2, stride2,
- cb_size >> s->ps.sps->hshift[2],
- cb_size >> s->ps.sps->vshift[2],
- &gb, s->ps.sps->pcm.bit_depth_chroma);
}
+ else
+#endif
+ {
+ const int stride0 = s->frame->linesize[0];
+ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
+ const int stride1 = s->frame->linesize[1];
+ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+ const int stride2 = s->frame->linesize[2];
+ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
+
+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+ if (s->ps.sps->chroma_format_idc) {
+ s->hevcdsp.put_pcm(dst1, stride1,
+ cb_size >> s->ps.sps->hshift[1],
+ cb_size >> s->ps.sps->vshift[1],
+ &gb, s->ps.sps->pcm.bit_depth_chroma);
+ s->hevcdsp.put_pcm(dst2, stride2,
+ cb_size >> s->ps.sps->hshift[2],
+ cb_size >> s->ps.sps->vshift[2],
+ &gb, s->ps.sps->pcm.bit_depth_chroma);
+ }
+ }
return 0;
}
+#ifdef RPI
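+// Allocate n int16_t coefficients from the job's coefficient buffer buf_no;
+// buffer 3 is filled downwards from its base, the others upwards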
+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)
+{
+ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no;
+ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
+ cfe->n += n;
+ return coeffs;
+}
+#endif
+
+// x * 2^(y*2)
+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
+{
+ return x << (y * 2);
+}
+
+static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size)
+{
+ // Length in bits
+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) +
+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]);
+
+ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3);
+
+ if (!s->sh.disable_deblocking_filter_flag)
+ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ // Copy coeffs
+ const int blen = (length + 7) >> 3;
+ // Round allocated bytes up to nearest 32 to avoid alignment confusion
+        // Allocation is in int16_t units
+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
+ // sample this rounding doesn't affect the total size we need to allocate for
+ // the coeff buffer
+ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1);
+ memcpy(coeffs, pcm, blen);
+
+        // Our coeff stash assumes that any partially allocated 64-byte lump
+ // is zeroed so make that true.
+ {
+ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
+ if ((-(intptr_t)eopcm & 63) != 0)
+ memset(eopcm, 0, -(intptr_t)eopcm & 63);
+ }
+
+ // Add command
+ {
+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s);
+ cmd->type = RPI_PRED_I_PCM;
+ cmd->size = log2_cb_size;
+ cmd->i_pcm.src = coeffs;
+ cmd->i_pcm.x = x0;
+ cmd->i_pcm.y = y0;
+ cmd->i_pcm.src_len = length;
+ }
+ return 0;
+ }
+#endif
+
+ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size);
+}
+
/**
* 8.5.3.2.2.1 Luma sample unidirectional interpolation process
*
@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
int idx = ff_hevc_pel_weight[block_w];
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> 2;
y_off += mv->y >> 2;
src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
* @param mv1 motion vector1 (relative to block position) to get pixel data from
* @param current_mv current motion vector structure
*/
- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
{
@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
intptr_t _mx = mx << (1 - hshift);
intptr_t _my = my << (1 - vshift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> (2 + hshift);
y_off += mv->y >> (2 + vshift);
src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1572,6 +2202,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
int hshift = s->ps.sps->hshift[1];
int vshift = s->ps.sps->vshift[1];
+#ifdef DISABLE_MC
+ return;
+#endif
+
intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
_mx1, _my1, block_w);
}
-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
- const Mv *mv, int y0, int height)
+#ifdef RPI
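+// Wait until the given field of ref has progressed to at least val.
+// Waiters are queued on the owning decoder's progress state and are woken
+// by ff_hevc_rpi_progress_signal_field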
+void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int val, const int field)
{
- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
+ HEVCContext *const fs = ref->tf.owner->priv_data;
+ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field;
+ sem_t * sem = NULL;
+
+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+ if (((volatile int *)ref->tf.progress->data)[field] < val) {
+ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait;
+
+ av_assert0(pwait->req == -1 && pwait->next == NULL);
- if (s->threads_type == FF_THREAD_FRAME )
- ff_thread_await_progress(&ref->tf, y, 0);
+ pwait->req = val;
+ pwait->next = NULL;
+ if (pstate->first == NULL)
+ pstate->first = pwait;
+ else
+ pstate->last->next = pwait;
+ pstate->last = pwait;
+ sem = &pwait->sem;
+ }
+ pthread_mutex_unlock(&pstate->lock);
+
+ if (sem != NULL) {
+ while (sem_wait(sem) != 0)
+ av_assert0(errno == EINTR);
+ }
+ }
+}
+
+void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field)
+{
+ HEVCRPiFrameProgressState *const pstate = s->progress_states + field;
+
+ ((int *)s->ref->tf.progress->data)[field] = val;
+
+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+ {
+ HEVCRPiFrameProgressWait ** ppwait = &pstate->first;
+ HEVCRPiFrameProgressWait * pwait;
+
+ while ((pwait = *ppwait) != NULL) {
+ if (pwait->req > val)
+ {
+ ppwait = &pwait->next;
+ pstate->last = pwait;
+ }
+ else
+ {
+ *ppwait = pwait->next;
+ pwait->req = -1;
+ pwait->next = NULL;
+ sem_post(&pwait->sem);
+ }
+ }
+ }
+ pthread_mutex_unlock(&pstate->lock);
+}
+
+static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate)
+{
+ pstate->first = NULL;
+ pstate->last = NULL;
+ pthread_mutex_init(&pstate->lock, NULL);
+}
+
+static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait)
+{
+ pwait->req = -1;
+ pwait->next = NULL;
+ sem_init(&pwait->sem, 0, 0);
+}
+
+static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate)
+{
+ av_assert0(pstate->first == NULL);
+ pthread_mutex_destroy(&pstate->lock);
+}
+
+static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait)
+{
+ sem_destroy(&pwait->sem);
+}
+#endif
+
+static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref,
+ const Mv * const mv, const int y0, const int height)
+{
+ if (s->threads_type == FF_THREAD_FRAME) {
+ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ int16_t *const pr = s->jb0->progress + ref->dpb_no;
+ if (*pr < y) {
+ *pr = y;
+ }
+ }
+ else
+#endif
+ // It is a const ThreadFrame but the prototype isn't
+ ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
+ }
}
static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
}
}
-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH,
- int log2_cb_size, int partIdx, int idx)
+
+#if RPI_INTER
+
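+// Pick the least-loaded QPU in the current group, account load_val against it
+// and chain the new command by writing fn into the link word of its previous command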
+static HEVCRpiInterPredQ *
+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
+{
+ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr;
+ HEVCRpiInterPredQ * ypt = yp + 1;
+ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) {
+ if (ypt->load < yp->load)
+ yp = ypt;
+ }
+
+ yp->load += load_val;
+ ipe->used_grp = 1;
+ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd
+
+ return yp;
+}
+
+
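+// Append each QPU's sync program to its command chain and reset the per-QPU
+// load counters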
+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
+{
+ for (unsigned int i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr->data[-1] = q->code_sync;
+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1);
+ q->load = 0;
+ }
+}
+
+// Returns 0 on success, -1 if Q is dangerously full
+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
+{
+ if (!ipe->used_grp)
+ return 0;
+
+ if ((ipe->curr += ipe->n_grp) >= ipe->n)
+ {
+ ipe->curr = 0;
+ rpi_inter_pred_sync(ipe);
+ }
+ ipe->used = 1;
+ ipe->used_grp = 0;
+
+ for (unsigned int i = 0; i != ipe->n_grp; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr;
+ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
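+// Reset all QPU command queues to empty ready for a new job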
+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ ipe->curr = 0;
+ ipe->used = 0;
+ ipe->used_grp = 0;
+ for (i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr = q->qpu_mc_base;
+ q->load = 0;
+ q->last_l0 = NULL;
+ q->last_l1 = NULL;
+ }
+}
+
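+// Allocate the GPU memory (cached if RPI_CACHE_UNIF_MVS is set) that backs the
+// QPU command queues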
+static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
+ const unsigned int n_max, const unsigned int n_grp,
+ const unsigned int total_size, const unsigned int min_gap)
+{
+ memset(ipe, 0, sizeof(*ipe));
+ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL);
+ ipe->n_grp = n_grp;
+ ipe->min_gap = min_gap;
+
+#if RPI_CACHE_UNIF_MVS
+ gpu_malloc_cached(total_size, &ipe->gptr);
+#else
+ gpu_malloc_uncached(total_size, &ipe->gptr);
+#endif
+}
+
+
+#if RPI_QPU_EMU_Y
+#define get_mc_address_y(f) ((f)->data[0])
+#else
+#define get_mc_address_y(f) get_vc_address_y(f)
+#endif
+#if RPI_QPU_EMU_C
+#define get_mc_address_u(f) ((f)->data[1])
+#else
+#define get_mc_address_u(f) get_vc_address_u(f)
+#endif
+
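+// Scale a weighted-pred offset to the stream bit depth unless high-precision
+// offsets are signalled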
+static inline int offset_depth_adj(const HEVCContext *const s, const int wt)
+{
+ return s->ps.sps->high_precision_offsets_enabled_flag ? wt :
+ wt << (s->ps.sps->bit_depth - 8);
+}
+
+static void
+rpi_pred_y(HEVCContext *const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const Mv *const mv,
+ const int weight_mul,
+ const int weight_offset,
+ AVFrame *const src_frame)
+{
+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my << 8) | mx;
+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
+ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul);
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+
+ if (my_mx == 0)
+ {
+ const int x1 = x0 + (mv->x >> 2);
+ const int y1 = y0 + (mv->y >> 2);
+ const int bh = nPbH;
+
+ for (int start_x = 0; start_x < nPbW; start_x += 16)
+ {
+ const int bw = FFMIN(nPbW - start_x, 16);
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
+
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ ++ts->y_pred1_x0y0;
+
+ if (nPbW > 8)
+ ++ts->y_pred1_wgt8;
+ else
+ ++ts->y_pred1_wle8;
+
+ if (nPbH > 16)
+ ++ts->y_pred1_hgt16;
+ else
+ ++ts->y_pred1_hle16;
+ }
+#endif
+
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src_vc_address_y;
+ cmd_y->w = bw;
+ cmd_y->h = bh;
+ cmd_y->wo1 = wo;
+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+ else
+ {
+ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+ const unsigned int bh = nPbH;
+ int start_x = 0;
+
+#if 1
+        // As Y-pred operates on two independent 8-wide src blocks we can merge
+        // this pred with the previous one if the previous one is 8 pel wide,
+ // the same height as the current block, immediately to the left of our
+ // current dest block and mono-pred.
+
+ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p;
+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
+ {
+ const int bw = FFMIN(nPbW, 8);
+ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1;
+
+ last_y8_src2->x = x1_m3;
+ last_y8_src2->y = y1_m3;
+ last_y8_src2->base = src_vc_address_y;
+ last_y8_p->w += bw;
+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
+ last_y8_p->wo2 = wo;
+
+ s->last_y8_p = NULL;
+ s->last_y8_l1 = NULL;
+ start_x = bw;
+#if RPI_TSTATS
+ ++s->tstats.y_pred1_y8_merge;
+#endif
+ }
+#endif
+
+ for (; start_x < nPbW; start_x += 16)
+ {
+ const int bw = FFMIN(nPbW - start_x, 16);
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ if (mx == 0 && my == 0)
+ ++ts->y_pred1_x0y0;
+ else if (mx == 0)
+ ++ts->y_pred1_x0;
+ else if (my == 0)
+ ++ts->y_pred1_y0;
+ else
+ ++ts->y_pred1_xy;
+
+ if (nPbW > 8)
+ ++ts->y_pred1_wgt8;
+ else
+ ++ts->y_pred1_wle8;
+
+ if (nPbH > 16)
+ ++ts->y_pred1_hgt16;
+ else
+ ++ts->y_pred1_hle16;
+ }
+#endif
+ src1->x = x1_m3 + start_x;
+ src1->y = y1_m3;
+ src1->base = src_vc_address_y;
+ if (bw <= 8)
+ {
+ src2->x = MC_DUMMY_X;
+ src2->y = MC_DUMMY_Y;
+#if RPI_QPU_EMU_Y
+ src2->base = s->qpu_dummy_frame_emu;
+#else
+ src2->base = s->qpu_dummy_frame_qpu;
+#endif
+ }
+ else
+ {
+ src2->x = x1_m3 + start_x + 8;
+ src2->y = y1_m3;
+ src2->base = src_vc_address_y;
+ }
+ cmd_y->w = bw;
+ cmd_y->h = bh;
+ cmd_y->mymx21 = my2_mx2_my_mx;
+ cmd_y->wo1 = wo;
+ cmd_y->wo2 = wo;
+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+
+ if (bw == 8) {
+ s->last_y8_l1 = src2;
+ s->last_y8_p = cmd_y;
+ }
+ }
+ }
+}
+
+static void
+rpi_pred_y_b(HEVCContext * const s,
+ const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const struct MvField *const mv_field,
+ AVFrame *const src_frame,
+ AVFrame *const src_frame2)
+{
+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+ const Mv * const mv = mv_field->mv + 0;
+ const Mv * const mv2 = mv_field->mv + 1;
+
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my<<8) | mx;
+ const unsigned int mx2 = mv2->x & 3;
+ const unsigned int my2 = mv2->y & 3;
+ const unsigned int my2_mx2 = (my2<<8) | mx2;
+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
+ const unsigned int ref_idx0 = mv_field->ref_idx[0];
+ const unsigned int ref_idx1 = mv_field->ref_idx[1];
+ const uint32_t wt_offset =
+ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1;
+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
+
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip;
+
+ if (my2_mx2_my_mx == 0)
+ {
+ const int x1 = x0 + (mv->x >> 2);
+ const int y1 = y0 + (mv->y >> 2);
+ const int x2 = x0 + (mv2->x >> 2);
+ const int y2 = y0 + (mv2->y >> 2);
+ const int bh = nPbH;
+
+ // Can do chunks a full 16 wide if we don't want the H filter
+ for (int start_x=0; start_x < nPbW; start_x += 16)
+ {
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ ++ts->y_pred2_x0y0;
+
+ if (nPbH > 16)
+ ++ts->y_pred2_hgt16;
+ else
+ ++ts->y_pred2_hle16;
+ }
+#endif
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src1_base;
+ src2->x = x2 + start_x;
+ src2->y = y2;
+ src2->base = src2_base;
+ cmd_y->w = FFMIN(nPbW - start_x, 16);
+ cmd_y->h = bh;
+ cmd_y->mymx21 = 0;
+ cmd_y->wo1 = wo1;
+ cmd_y->wo2 = wo2;
+ cmd_y->dst_addr = dst + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+ else
+ {
+ // Filter requires a run-up of 3
+ const int x1 = x0 + (mv->x >> 2) - 3;
+ const int y1 = y0 + (mv->y >> 2) - 3;
+ const int x2 = x0 + (mv2->x >> 2) - 3;
+ const int y2 = y0 + (mv2->y >> 2) - 3;
+ const int bh = nPbH;
+
+ for (int start_x=0; start_x < nPbW; start_x += 8)
+ { // B blocks work 8 at a time
+ // B weights aren't doubled as the QPU code does the same
+ // amount of work as it does for P
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ const unsigned int mmx = mx | mx2;
+ const unsigned int mmy = my | my2;
+ if (mmx == 0 && mmy == 0)
+ ++ts->y_pred2_x0y0;
+ else if (mmx == 0)
+ ++ts->y_pred2_x0;
+ else if (mmy == 0)
+ ++ts->y_pred2_y0;
+ else
+ ++ts->y_pred2_xy;
+
+ if (nPbH > 16)
+ ++ts->y_pred2_hgt16;
+ else
+ ++ts->y_pred2_hle16;
+ }
+#endif
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src1_base;
+ src2->x = x2 + start_x;
+ src2->y = y2;
+ src2->base = src2_base;
+ cmd_y->w = FFMIN(nPbW - start_x, 8);
+ cmd_y->h = bh;
+ cmd_y->mymx21 = my2_mx2_my_mx;
+ cmd_y->wo1 = wo1;
+ cmd_y->wo2 = wo2;
+ cmd_y->dst_addr = dst + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+}
+
+// h/v shifts fixed at one as that is all the qasm copes with
+static void
+rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c,
+ const int nPbW_c, const int nPbH_c,
+ const Mv * const mv,
+ const int16_t * const c_weights,
+ const int16_t * const c_offsets,
+ AVFrame * const src_frame)
+{
+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
+ const int hshift = 1; // = s->ps.sps->hshift[1];
+ const int vshift = 1; // = s->ps.sps->vshift[1];
+
+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1;
+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
+ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]);
+ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]);
+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
+ const unsigned int bh = nPbH_c;
+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
+
+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
+ {
+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
+ qpu_mc_src_t * const last_lx = *plast_lx;
+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+
+ last_lx->x = x1_c + start_x;
+ last_lx->y = y1_c;
+ last_lx->base = src_base_u;
+ cmd_c->h = bh;
+ cmd_c->w = bw;
+ cmd_c->coeffs_x = x_coeffs;
+ cmd_c->coeffs_y = y_coeffs;
+ cmd_c->wo_u = wo_u;
+ cmd_c->wo_v = wo_v;
+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
+ *plast_lx = &cmd_c->next_src;
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
+ }
+ return;
+}
+
+// h/v shifts fixed at one as that is all the qasm copes with
+static void
+rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c,
+ const int nPbW_c, const int nPbH_c,
+ const struct MvField * const mv_field,
+ const int16_t * const c_weights,
+ const int16_t * const c_offsets,
+ const int16_t * const c_weights2,
+ const int16_t * const c_offsets2,
+ AVFrame * const src_frame,
+ AVFrame * const src_frame2)
+{
+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
+ const int hshift = 1; // s->ps.sps->hshift[1];
+ const int vshift = 1; // s->ps.sps->vshift[1];
+ const Mv * const mv = mv_field->mv + 0;
+ const Mv * const mv2 = mv_field->mv + 1;
+
+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1;
+
+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
+
+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1;
+
+ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]);
+ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]);
+
+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
+ const unsigned int bh = nPbH_c;
+
+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
+ {
+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+
+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
+ qpu_mc_src_t * const src_l0 = cp->last_l0;
+ qpu_mc_src_t * const src_l1 = cp->last_l1;
+
+ src_l0->x = x1_c + start_x;
+ src_l0->y = y1_c;
+ src_l0->base = src1_base;
+ src_l1->x = x2_c + start_x;
+ src_l1->y = y2_c;
+ src_l1->base = src2_base;
+
+ u[0].h = bh;
+ u[0].w = bw;
+ u[0].coeffs_x1 = coefs0_x;
+ u[0].coeffs_y1 = coefs0_y;
+ u[0].weight_u1 = c_weights[0]; // Weight L0 U
+ u[0].weight_v1 = c_weights[1]; // Weight L0 V
+ u[0].coeffs_x2 = coefs1_x;
+ u[0].coeffs_y2 = coefs1_y;
+ u[0].wo_u2 = wo_u2;
+ u[0].wo_v2 = wo_v2;
+ u[0].dst_addr_c = dst_base_u + (start_x << xshl);
+
+ cp->last_l0 = &u[0].next_src1;
+ cp->last_l1 = &u[0].next_src2;
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
+ }
+}
+
+
+#endif
+
+
+
+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
{
#define POS(c_idx, x, y) \
&s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
(((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
- HEVCLocalContext *lc = s->HEVClc;
+ HEVCLocalContext * const lc = s->HEVClc;
int merge_idx = 0;
struct MvField current_mv = {{{ 0 }}};
@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int y_cb = y0 >> log2_min_cb_size;
int x_pu, y_pu;
int i, j;
-
- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
if (!skip_flag)
lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+ ref0->frame);
+ } else
+#endif
+ {
+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l0[current_mv.ref_idx[0]],
s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ }
if (s->ps.sps->chroma_format_idc) {
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+ ref0->frame);
+ return;
+ }
+#endif
chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+ ref1->frame);
+ } else
+#endif
+ {
+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l1[current_mv.ref_idx[1]],
s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ }
if (s->ps.sps->chroma_format_idc) {
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+ ref1->frame);
+ return;
+ }
+#endif
chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
+ } else
+#endif
+ {
+ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
ref1->frame, &current_mv.mv[1], &current_mv);
+ }
if (s->ps.sps->chroma_format_idc) {
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
+ &current_mv,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+ ref0->frame,
+ ref1->frame);
+ return;
+ }
+#endif
chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
if (s->ps.sps->pcm.loop_filter_disable_flag)
+ {
set_deblocking_bypass(s, x0, y0, log2_cb_size);
+ }
if (ret < 0)
return ret;
@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
}
+#ifdef RPI
+static void rpi_execute_dblk_cmds(HEVCContext *s)
+{
+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ HEVCRpiDeblkEnv *const de = &s->jb1->deblk;
+ unsigned int i;
+
+ for (i = 0; i != de->n; ++i)
+ {
+ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size);
+ }
+ de->n = 0;
+}
+
+#if 0
+static void rpi_execute_transform(HEVCContext *s)
+{
+ int i=2;
+ int job = s->pass1_job;
+ /*int j;
+ int16_t *coeffs = s->coeffs_buf_arm[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+ s->hevcdsp.idct[4-2](coeffs, 16);
+ }
+ i=3;
+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+ s->hevcdsp.idct[5-2](coeffs, 32);
+ }*/
+
+ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+ //gpu_cache_flush(&s->coeffs_buf_accelerated);
+ //vpu_wait(s->vpu_id);
+
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+}
+#endif
+
+
+#define RPI_OPT_SEP_PRED 0
+
+
+// I-pred and transform_and_add for all block types are done here
+// All ARM
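+// The commands in the intra queue are built during the bitstream decode pass
+// (pass 0); this replays them on the worker thread in pass 1, after the VPU
+// transforms they depend on have completed (see worker_core).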
+#if RPI_OPT_SEP_PRED
+static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma)
+#else
+static void rpi_execute_pred_cmds(HEVCContext * const s)
+#endif
+{
+ int i;
+ HEVCRpiIntraPredEnv * iap = &s->jb1->intra;
+ const HEVCPredCmd *cmd = iap->cmds;
+#ifdef RPI
+ HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+#else
+ HEVCLocalContext *lc = s->HEVClc;
+#endif
+
+ for(i = iap->n; i > 0; i--, cmd++) {
+// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+#if RPI_OPT_SEP_PRED
+ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) {
+ continue;
+ }
+#endif
+
+ switch (cmd->type)
+ {
+ case RPI_PRED_INTRA:
+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
+ lc->na.cand_left = (cmd->na >> 3) & 1;
+ lc->na.cand_up_left = (cmd->na >> 2) & 1;
+ lc->na.cand_up = (cmd->na >> 1) & 1;
+ lc->na.cand_up_right = (cmd->na >> 0) & 1;
+ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0)
+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+ else
+ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+ break;
+
+ case RPI_PRED_ADD_RESIDUAL:
+ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
+ case RPI_PRED_ADD_DC:
+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
+ break;
+#if RPI_HEVC_SAND
+ case RPI_PRED_ADD_RESIDUAL_U:
+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_V:
+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_C:
+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
+ case RPI_PRED_ADD_DC_U:
+ case RPI_PRED_ADD_DC_V:
+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
+ break;
+#endif
+
+ case RPI_PRED_I_PCM:
+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
+ break;
+
+ default:
+ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
+ abort();
+ }
+ }
+#if RPI_OPT_SEP_PRED
+ if (do_luma)
+#endif
+ {
+ iap->n = 0;
+ }
+}
+
+
+#endif
+
+#ifdef RPI
+
+// Set initial uniform job values & zero ctu_count
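+// Each QPU queue is primed with a "setup" uniform block: picture dimensions,
+// sand strides, the weight denominator and zeroed src/next links. The per-block
+// commands built by the rpi_pred_*() calls are then appended after it.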
+static void rpi_begin(HEVCContext *s)
+{
+#if RPI_INTER
+ unsigned int i;
+ HEVCRpiJob * const jb = s->jb0;
+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
+
+ const uint16_t pic_width_y = s->ps.sps->width;
+ const uint16_t pic_height_y = s->ps.sps->height;
+
+ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1];
+ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1];
+
+ rpi_inter_pred_reset(cipe);
+ for (i = 0; i < cipe->n; i++) {
+ HEVCRpiInterPredQ * const cp = cipe->q + i;
+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
+
+ u->next_src1.x = 0;
+ u->next_src1.y = 0;
+ u->next_src1.base = 0;
+ u->pic_cw = pic_width_c;
+ u->pic_ch = pic_height_c;
+ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
+ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
+ u->wdenom = s->sh.chroma_log2_weight_denom;
+ cp->last_l0 = &u->next_src1;
+
+ u->next_fn = 0;
+ u->next_src2.x = 0;
+ u->next_src2.y = 0;
+ u->next_src2.base = 0;
+ cp->last_l1 = &u->next_src2;
+
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
+ }
+
+ rpi_inter_pred_reset(yipe);
+ for (i = 0; i < yipe->n; i++) {
+ HEVCRpiInterPredQ * const yp = yipe->q + i;
+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
+
+ y->next_src1.x = 0;
+ y->next_src1.y = 0;
+ y->next_src1.base = 0;
+ y->next_src2.x = 0;
+ y->next_src2.y = 0;
+ y->next_src2.base = 0;
+ y->pic_h = pic_height_y;
+ y->pic_w = pic_width_y;
+ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
+ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
+ y->wdenom = s->sh.luma_log2_weight_denom;
+ y->next_fn = 0;
+ yp->last_l0 = &y->next_src1;
+ yp->last_l1 = &y->next_src2;
+
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
+ }
+
+ s->last_y8_p = NULL;
+ s->last_y8_l1 = NULL;
+
+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) {
+ jb->progress[i] = -1;
+ }
+
+#endif
+ s->ctu_count = 0;
+}
+#endif
+
+
+#if RPI_INTER
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+static unsigned int mc_terminate_add_qpu(HEVCContext * const s,
+ const vpu_qpu_job_h vqj,
+ rpi_cache_flush_env_t * const rfe,
+ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
+ unsigned int max_block = 0;
+
+ if (!ipe->used) {
+ return 0;
+ }
+
+ if (ipe->curr != 0) {
+ rpi_inter_pred_sync(ipe);
+ }
+
+ // Add final commands to Q
+ for(i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const yp = ipe->q + i;
+ qpu_mc_src_t *const p0 = yp->last_l0;
+ qpu_mc_src_t *const p1 = yp->last_l1;
+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
+
+ if (block_size > max_block)
+ max_block = block_size;
+
+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
+
+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+ p0->x = MC_DUMMY_X;
+ p0->y = MC_DUMMY_Y;
+ p0->base = s->qpu_dummy_frame_qpu;
+ p1->x = MC_DUMMY_X;
+ p1->y = MC_DUMMY_Y;
+ p1->base = s->qpu_dummy_frame_qpu;
+
+ yp->last_l0 = NULL;
+ yp->last_l1 = NULL;
+
+ // Add to mailbox list
+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
+ mail[i][1] = yp->code_setup;
+ }
+
+#if RPI_CACHE_UNIF_MVS
+    // We don't need an invalidate here as the uniforms aren't changed by the QPU,
+    // and leaving them in the ARM cache avoids (pointless) pre-reads when writing
+    // new values, which seems to give us a small performance advantage
+    //
+    // In most cases we will not have a completely packed set of uniforms and, as
+    // we have a 2D invalidate, we write back all uniform Qs to the depth of the
+    // fullest
+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
+ ipe->n, ipe->max_fill + ipe->min_gap);
+#endif
+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
+
+ return 1;
+}
+#endif
+
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+static unsigned int mc_terminate_add_emu(HEVCContext * const s,
+ const vpu_qpu_job_h vqj,
+ rpi_cache_flush_env_t * const rfe,
+ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ if (!ipe->used) {
+ return 0;
+ }
+
+ if (ipe->curr != 0) {
+ rpi_inter_pred_sync(ipe);
+ }
+
+ // Add final commands to Q
+ for(i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const yp = ipe->q + i;
+ qpu_mc_src_t *const p0 = yp->last_l0;
+ qpu_mc_src_t *const p1 = yp->last_l1;
+
+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
+
+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+ p0->x = MC_DUMMY_X;
+ p0->y = MC_DUMMY_Y;
+ p0->base = s->qpu_dummy_frame_emu;
+ p1->x = MC_DUMMY_X;
+ p1->y = MC_DUMMY_Y;
+ p1->base = s->qpu_dummy_frame_emu;
+
+ yp->last_l0 = NULL;
+ yp->last_l1 = NULL;
+ }
+
+ return 1;
+}
+#endif
+
+
+#if RPI_QPU_EMU_Y
+#define mc_terminate_add_y mc_terminate_add_emu
+#else
+#define mc_terminate_add_y mc_terminate_add_qpu
+#endif
+#if RPI_QPU_EMU_C
+#define mc_terminate_add_c mc_terminate_add_emu
+#else
+#define mc_terminate_add_c mc_terminate_add_qpu
+#endif
+#endif
+
+#ifdef RPI
+
+
+static void flush_frame(HEVCContext *s,AVFrame *frame)
+{
+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ rpi_cache_flush_finish(rfe);
+}
+
+
+// Core execution tasks
+static void worker_core(HEVCContext * const s)
+{
+#if RPI_OPT_SEP_PRED
+ vpu_qpu_wait_h sync_c;
+#endif
+ vpu_qpu_wait_h sync_y;
+
+ HEVCRpiJob * const jb = s->jb1;
+ int pred_y, pred_c;
+
+ const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+
+ {
+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
+ if (cf->s[3].n + cf->s[2].n != 0)
+ {
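+        // Coefficient buffer layout (as filled in pass 0): s[2] holds the 16x16
+        // transform blocks (256 coeffs each) counted up from the start of gptr;
+        // s[3] holds the 32x32 blocks (1024 coeffs each) and appears to be filled
+        // downwards from the end, hence offset32 locating its lowest address.
+        // The >>8 / >>10 turn coefficient counts into block counts for the VPU.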
+ const unsigned int csize = sizeof(cf->s[3].buf[0]);
+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
+ vpu_qpu_job_add_vpu(vqj,
+ vpu_get_fn(s->ps.sps->bit_depth),
+ vpu_get_constants(),
+ cf->gptr.vc,
+ cf->s[2].n >> 8,
+ cf->gptr.vc + offset32,
+ cf->s[3].n >> 10,
+ 0);
+
+ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
+ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
+ }
+ }
+
+ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip);
+
+// We can take a sync here and try to locally overlap QPU processing with ARM
+// but testing showed a slightly negative benefit with noticeable extra complexity
+#if RPI_OPT_SEP_PRED
+ vpu_qpu_job_add_sync_this(vqj, &sync_c);
+#endif
+
+ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip);
+
+ vpu_qpu_job_add_sync_this(vqj, &sync_y);
+
+
+ // We are expecting a contiguous Z-shaped set of blocks
+ // So generate up to 3 blocks:
+ // 1st line
+ // body
+ // last line
+ // This will work even if we don't have the expected geometry
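+    // blks_tlbr[] accumulates up to 3 top/left/bottom/right bounding boxes
+    // (first partial row, body, last partial row) over the CTBs deblocked in
+    // this job so that we issue a handful of large cache invalidates below
+    // rather than one per CTB.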
+ if (pred_y || pred_c)
+ {
+ const HEVCRpiDeblkEnv *const de = &jb->deblk;
+ const HEVCRpiDeblkBlk * db = de->blks + 0;
+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ unsigned int x0 = db->x_ctb;
+ unsigned int xx = x0 + ctb_size;
+ unsigned int y0 = db->y_ctb;
+
+ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}};
+ unsigned int b = 0;
+ unsigned int i;
+
+ for (i = 1, ++db; i < de->n; ++i, ++db)
+ {
+ if (db->x_ctb == xx && db->y_ctb == y0) {
+ xx += ctb_size;
+ }
+ else
+ {
+ unsigned int * const tlbr = blks_tlbr[b];
+ if (tlbr[0] > y0)
+ tlbr[0] = y0;
+ if (tlbr[1] > x0)
+ tlbr[1] = x0;
+ if (tlbr[2] < y0 + ctb_size)
+ tlbr[2] = y0 + ctb_size;
+ if (tlbr[3] < xx)
+ tlbr[3] = xx;
+ x0 = db->x_ctb;
+ xx = x0 + ctb_size;
+ y0 = db->y_ctb;
+ b = 1;
+ }
+ }
+
+ if (blks_tlbr[b][0] != ~0U)
+ ++b;
+
+ {
+ unsigned int * const tlbr = blks_tlbr[b];
+ tlbr[0] = y0;
+ tlbr[1] = x0;
+ tlbr[2] = y0 + ctb_size;
+ tlbr[3] = xx;
+ }
+
+ // ??? Coalesce blocks ???
+ for (i = 0; i <= b; ++i) {
+ const unsigned int * const tlbr = blks_tlbr[i];
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
+ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c);
+ }
+ }
+
+
+ // Having accumulated some commands - do them
+ rpi_cache_flush_finish(rfe);
+
+ // Await progress as required
+ {
+ unsigned int i;
+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) {
+ if (jb->progress[i] >= 0) {
+ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]);
+ }
+ }
+ }
+
+ vpu_qpu_job_finish(vqj);
+
+ worker_pic_reset(&jb->coeffs);
+
+ // If we have emulated VPU ops - do it here
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ if (av_rpi_is_sand8_frame(s->frame))
+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
+ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
+#elif RPI_QPU_EMU_Y
+ rpi_shader_c8(s, &jb->luma_ip, NULL);
+#else
+ rpi_shader_c8(s, NULL, &jb->chroma_ip);
+#endif
+ else
+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
+ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
+#elif RPI_QPU_EMU_Y
+ rpi_shader_c16(s, &jb->luma_ip, NULL);
+#else
+ rpi_shader_c16(s, NULL, &jb->chroma_ip);
+#endif
+#endif
+
+#if RPI_OPT_SEP_PRED
+    // Wait for the VPU transform & chroma inter-pred to complete
+ vpu_qpu_wait(&sync_c);
+
+    // Perform chroma intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s, 0, 1);
+
+    // Wait for the luma inter-pred to complete
+ vpu_qpu_wait(&sync_y);
+
+    // Perform luma intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s, 1, 0);
+#else
+ // Wait for transform completion
+ vpu_qpu_wait(&sync_y);
+
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+#endif
+
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+}
+
+static void rpi_do_all_passes(HEVCContext *s)
+{
+ // Called from main thread - must be no pending background jobs
+ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending);
+
+ // Do the various passes - common with the worker code
+ worker_core(s);
+ // Prepare next batch
+ rpi_begin(s);
+}
+
+
+#endif
+
static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
{
HEVCContext *s = avctxt->priv_data;
@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
int y_ctb = 0;
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+#ifdef RPI
+    // * We don't support cross_component_prediction_enabled_flag, but as that
+    //   must be 0 unless we have 4:4:4, there is no point testing for it: we
+    //   only deal with sand, which is never 4:4:4
+ // [support wouldn't be hard]
+ s->enable_rpi =
+ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) ||
+ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10));
+#endif
+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L0],s->sh.nb_refs[L1]);
+
if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
return AVERROR_INVALIDDATA;
@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
}
}
+#ifdef RPI
+ // Worker must be idle at start
+ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending);
+ rpi_begin(s);
+#endif
+
while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+#ifdef RPI
+ // Report progress so we can use our MVs in other frames
+ // If we are tiled then this isn't really optimal but given that tiling
+ // can change on a per pic basis (described in PPS) other schemes are
+ // quite a lot harder
+ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) {
+ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1);
+ }
+
+ if (s->enable_rpi) {
+ int q_full = (++s->ctu_count >= s->max_ctu_count);
+
+ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0)
+ q_full = 1;
+ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0)
+ q_full = 1;
+
+ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb;
+ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb;
+
+ if (q_full) {
+ if (s->used_for_ref)
+ {
+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
+
+// worker_wait(s);
+ // Split work load onto separate threads so we make as rapid progress as possible with this frame
+ // Pass on this job to worker thread
+ worker_submit_job(s);
+
+ // Make sure we have space to prepare the next job
+ worker_pass0_ready(s);
+
+ // Prepare the next batch of commands
+ rpi_begin(s);
+ } else {
+ // Non-ref frame so do it all on this thread
+ rpi_do_all_passes(s);
+ }
+ }
+
+ }
+#endif
+
+
if (more_data < 0) {
s->tab_slice_address[ctb_addr_rs] = -1;
return more_data;
@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
ctb_addr_ts++;
ff_hevc_save_states(s, ctb_addr_ts);
+#ifdef RPI
+ if (s->enable_rpi)
+ continue;
+#endif
ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
}
+#ifdef RPI
+
+ // Wait for the worker to finish all its jobs
+ if (s->enable_rpi) {
+ worker_wait(s);
+ }
+
+ // Finish off any half-completed rows
+ if (s->enable_rpi && s->ctu_count) {
+ rpi_do_all_passes(s);
+ }
+
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+
+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
+ ts->y_pred2_hgt16, ts->y_pred2_hle16);
+ memset(ts, 0, sizeof(*ts));
+ }
+#endif
+
+#endif
+
if (x_ctb + ctb_size >= s->ps.sps->width &&
y_ctb + ctb_size >= s->ps.sps->height)
ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
s = s1->sList[self_id];
lc = s->HEVClc;
+#ifdef RPI
+ s->enable_rpi = 0;
+ //printf("Wavefront\n");
+#endif
+
if(ctb_row) {
ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
if (ret < 0)
return ret;
- if (s->max_ra == INT_MAX) {
- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
- s->max_ra = s->poc;
+ // The definition of _N unit types is "non-reference for other frames
+ // with the same temporal_id" so they may/will be ref frames for pics
+ // with a higher temporal_id.
+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
+ !(s->nal_unit_type == NAL_TRAIL_N ||
+ s->nal_unit_type == NAL_TSA_N ||
+ s->nal_unit_type == NAL_STSA_N ||
+ s->nal_unit_type == NAL_RADL_N ||
+ s->nal_unit_type == NAL_RASL_N);
+
+#if DEBUG_DECODE_N
+ {
+ static int z = 0;
+ if (IS_IDR(s)) {
+ z = 1;
+ }
+ if (z != 0 && z++ > DEBUG_DECODE_N) {
+ s->is_decoded = 0;
+ break;
+ }
+ }
+#endif
+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+ s->is_decoded = 0;
+ break;
+ }
+
+ if (s->sh.first_slice_in_pic_flag) {
+ if (s->max_ra == INT_MAX) {
+ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+ s->max_ra = s->poc;
+ } else {
+ if (IS_IDR(s))
+ s->max_ra = INT_MIN;
+ }
+ }
+
+ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) &&
+ s->poc <= s->max_ra) {
+ s->is_decoded = 0;
+ break;
} else {
if (IS_IDR(s))
s->max_ra = INT_MIN;
@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
}
}
-fail:
- if (s->ref && s->threads_type == FF_THREAD_FRAME)
- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-
+fail: // Also success path
+ if (s->ref != NULL) {
+ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) {
+#ifdef RPI
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height);
+#endif
+ ff_hevc_progress_signal_all_done(s);
+ }
+#ifdef RPI
+ // * Flush frame will become confused if we pass it something
+ // that doesn't have an expected number of planes (e.g. 400)
+ // So only flush if we are sure we can.
+ else if (s->enable_rpi) {
+ // Flush frame to real memory as we expect to be able to pass
+ // it straight on to mmal
+ flush_frame(s, s->frame);
+ }
+#endif
+ }
return ret;
}
@@ -3070,6 +5056,83 @@ fail:
return AVERROR(ENOMEM);
}
+#ifdef RPI
+static av_cold void hevc_init_worker(HEVCContext * const s)
+{
+ int err;
+
+ memset(s->jobs, 0, sizeof(s->jobs));
+
+ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) {
+ HEVCRpiJob * const jb = s->jobs + job;
+
+ sem_init(&jb->sem_in, 0, 0);
+ sem_init(&jb->sem_out, 0, 0);
+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
+
+ jb->intra.n = 0;
+ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS);
+
+ // ** Sizeof the union structure might be overkill but at the moment it
+ // is correct (it certainly isn't going to be too small)
+
+ rpi_inter_pred_alloc(&jb->chroma_ip,
+ QPU_N_MAX, QPU_N_GRP,
+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t),
+ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t));
+ rpi_inter_pred_alloc(&jb->luma_ip,
+ QPU_N_MAX, QPU_N_GRP,
+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t),
+ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t));
+
+ jb->deblk.n = 0;
+ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS);
+ }
+ s->pass0_job = 0;
+ s->pass1_job = 0;
+ s->jb0 = s->jobs + 0;
+ s->jb1 = s->jobs + 0;
+
+ err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+ if (err) {
+ printf("Failed to create worker thread\n");
+ exit(-1);
+ }
+}
+
+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
+{
+ av_freep(&ipe->q);
+ gpu_free(&ipe->gptr);
+}
+
+static av_cold void hevc_exit_worker(HEVCContext *s)
+{
+ void *res;
+ unsigned int i;
+
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ s->jobs[i].terminate = 1;
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ sem_post(&s->jobs[i].sem_in);
+ pthread_join(s->worker_thread, &res);
+
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ {
+ HEVCRpiJob * const jb = s->jobs + i;
+
+ sem_destroy(&jb->sem_in);
+ sem_destroy(&jb->sem_out);
+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
+ av_freep(&jb->intra.cmds);
+ av_freep(&jb->deblk.blks);
+ rpi_free_inter_pred(&jb->chroma_ip);
+ rpi_free_inter_pred(&jb->luma_ip);
+ }
+}
+
+#endif
+
static av_cold int hevc_decode_free(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
av_freep(&s->cabac_state);
- for (i = 0; i < 3; i++) {
- av_freep(&s->sao_pixel_buffer_h[i]);
- av_freep(&s->sao_pixel_buffer_v[i]);
+#ifdef RPI
+
+ hevc_exit_worker(s);
+ vpu_qpu_term();
+ for (i = 0; i != 2; ++i) {
+ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
}
+
+ av_rpi_zc_uninit(avctx);
+#endif
+
+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
+ av_freep(&s->sao_pixel_buffer_v[0]);
av_frame_free(&s->output_frame);
for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
return 0;
}
+
static av_cold int hevc_init_context(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
s->HEVClcList[0] = s->HEVClc;
s->sList[0] = s;
+#ifdef RPI
+    // Whilst FFmpeg's init fn is only called once, the close fn is called as
+    // many times as we have threads (init_thread_copy is called for the
+    // threads). So to match init & term, put the init here where it will be
+    // called by both init & copy.
+ av_rpi_zc_init(avctx);
+
+ if (vpu_qpu_init() != 0)
+ goto fail;
+
+#if RPI_INTER
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ {
+ static const uint32_t dframe[1] = {0x80808080};
+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
+ }
+#endif
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame
+#endif
+#endif
+ //gpu_malloc_uncached(2048*64,&s->dummy);
+
+ s->enable_rpi = 0;
+
+ for (i = 0; i != 2; ++i) {
+ ff_hevc_rpi_progress_init_state(s->progress_states + i);
+ }
+ hevc_init_worker(s);
+#endif
+
s->cabac_state = av_malloc(HEVC_CONTEXTS);
if (!s->cabac_state)
goto fail;
@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
if (!s->DPB[i].frame)
goto fail;
s->DPB[i].tf.f = s->DPB[i].frame;
+ s->DPB[i].dpb_no = i;
}
s->max_ra = INT_MAX;
@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
}
if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
- s->threads_type = FF_THREAD_FRAME;
- else
- s->threads_type = FF_THREAD_SLICE;
+ s->threads_type = FF_THREAD_FRAME;
+ else
+ s->threads_type = FF_THREAD_SLICE;
return 0;
}
@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = {
.update_thread_context = hevc_update_thread_context,
.init_thread_copy = hevc_init_thread_copy,
.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+// 0,
+// AV_CODEC_CAP_FRAME_THREADS,
AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
.profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
};
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 162ca0e582..d647232638 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -23,6 +23,7 @@
#ifndef AVCODEC_HEVC_H
#define AVCODEC_HEVC_H
+#include "rpi_opts.h"
#include "libavutil/buffer.h"
#include "libavutil/md5.h"
@@ -37,6 +38,10 @@
#include "thread.h"
#include "videodsp.h"
+#ifdef RPI
+#include "rpi_qpu.h"
+#endif
+
#define MAX_DPB_SIZE 16 // A.4.1
#define MAX_REFS 16
@@ -463,6 +468,7 @@ typedef struct HEVCSPS {
int implicit_rdpcm_enabled_flag;
int explicit_rdpcm_enabled_flag;
int intra_smoothing_disabled_flag;
+ int high_precision_offsets_enabled_flag;
int persistent_rice_adaptation_enabled_flag;
///< coded frame dimension in various units
@@ -660,6 +666,7 @@ typedef struct CodingUnit {
uint8_t cu_transquant_bypass_flag;
} CodingUnit;
+#if 0
typedef struct Mv {
int16_t x; ///< horizontal component of motion vector
int16_t y; ///< vertical component of motion vector
@@ -670,6 +677,7 @@ typedef struct MvField {
int8_t ref_idx[2];
int8_t pred_flag;
} MvField;
+#endif
typedef struct NeighbourAvailable {
int cand_bottom_left;
@@ -745,9 +753,23 @@ typedef struct HEVCFrame {
* A combination of HEVC_FRAME_FLAG_*
*/
uint8_t flags;
+
+ // Entry no in DPB - can be used as a small unique
+ // frame identifier (within the current thread)
+ uint8_t dpb_no;
} HEVCFrame;
+#ifdef RPI
+typedef struct HEVCLocalContextIntra {
+ TransformUnit tu;
+ NeighbourAvailable na;
+} HEVCLocalContextIntra;
+#endif
+
typedef struct HEVCLocalContext {
+ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!)
+ NeighbourAvailable na;
+
uint8_t cabac_state[HEVC_CONTEXTS];
uint8_t stat_coeff[4];
@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext {
int qPy_pred;
- TransformUnit tu;
-
uint8_t ctb_left_flag;
uint8_t ctb_up_flag;
uint8_t ctb_up_right_flag;
@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext {
int ct_depth;
CodingUnit cu;
PredictionUnit pu;
- NeighbourAvailable na;
#define BOUNDARY_LEFT_SLICE (1 << 0)
#define BOUNDARY_LEFT_TILE (1 << 1)
@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext {
int boundary_flags;
} HEVCLocalContext;
+#ifdef RPI
+
+// The processing is done in chunks
+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+// but allocates more memory and increases the latency before data in the next frame can be processed
+#define RPI_NUM_CHUNKS 4
+#define RPI_CHUNK_SIZE 12
+#define RPI_ROUND_TO_LINES 0
+
+// RPI_MAX_WIDTH is the maximum width in pixels supported by the accelerated code
+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE)
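+// With the values above this works out at 4 * 64 * 12 = 3072 pixels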
+
+// Worst case is 4:4:4 with 4x4 blocks in 64-pixel-high coding tree blocks, i.e. 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4))
+#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4))
+// Each block can have an intra prediction and a transform_add command
+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Worst case is 16x16 CTUs
+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
+
+#define RPI_CMD_LUMA_UNI 0
+#define RPI_CMD_CHROMA_UNI 1
+#define RPI_CMD_LUMA_BI 2
+#define RPI_CMD_CHROMA_BI 3
+#define RPI_CMD_V_BI 4
+
+// Command for inter prediction
+typedef struct HEVCMvCmd {
+ uint8_t cmd;
+ uint8_t block_w;
+ uint8_t block_h;
+ int8_t ref_idx[2];
+ uint16_t dststride;
+ uint16_t srcstride;
+ uint16_t srcstride1;
+ int16_t weight;
+ int16_t offset;
+ int16_t x_off;
+ int16_t y_off;
+ uint8_t *src;
+ uint8_t *src1;
+ uint8_t *dst;
+ Mv mv;
+ Mv mv1;
+} HEVCMvCmd;
+
+
+// Command for intra prediction and transform_add of predictions to coefficients
+enum rpi_pred_cmd_e
+{
+ RPI_PRED_ADD_RESIDUAL,
+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx
+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx
+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
+ RPI_PRED_ADD_DC,
+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
+ RPI_PRED_ADD_DC_V,
+ RPI_PRED_INTRA,
+ RPI_PRED_I_PCM,
+ RPI_PRED_CMD_MAX
+};
+
+typedef struct HEVCPredCmd {
+ uint8_t type;
+ uint8_t size; // log2 "size" used by all variants
+ uint8_t na; // i_pred - but left here as they pack well
+ uint8_t c_idx; // i_pred
+ union {
+ struct { // TRANSFORM_ADD
+ uint8_t * dst;
+ const int16_t * buf;
+ uint16_t stride; // Should be good enough for all pic fmts we use
+ int16_t dc;
+ } ta;
+ struct {
+ uint8_t * dst;
+ uint32_t stride;
+ int dc;
+ } dc;
+ struct { // INTRA
+ uint16_t x;
+ uint16_t y;
+ enum IntraPredMode mode;
+ } i_pred;
+ struct { // I_PCM
+ uint16_t x;
+ uint16_t y;
+ const void * src;
+ uint32_t src_len;
+ } i_pcm;
+ };
+} HEVCPredCmd;
+
+#endif
+
+#ifdef RPI
+#include <semaphore.h>
+
+union qpu_mc_pred_cmd_s;
+struct qpu_mc_pred_y_p_s;
+struct qpu_mc_src_s;
+
+typedef struct HEVCRpiInterPredQ
+{
+ union qpu_mc_pred_cmd_u *qpu_mc_base;
+ union qpu_mc_pred_cmd_u *qpu_mc_curr;
+ struct qpu_mc_src_s *last_l0;
+ struct qpu_mc_src_s *last_l1;
+ unsigned int load;
+ uint32_t code_setup;
+ uint32_t code_sync;
+ uint32_t code_exit;
+} HEVCRpiInterPredQ;
+
+typedef struct HEVCRpiInterPredEnv
+{
+ HEVCRpiInterPredQ * q;
+ unsigned int n; // Number of Qs
+ unsigned int n_grp; // Number of Q in a group
+ unsigned int curr; // Current Q number (0..n-1)
+ int used; // 0 if nothing in any Q, 1 otherwise
+ int used_grp; // 0 if nothing in any Q in the current group
+ unsigned int max_fill;
+ unsigned int min_gap;
+ GPU_MEM_PTR_T gptr;
+} HEVCRpiInterPredEnv;
+
+typedef struct HEVCRpiIntraPredEnv {
+ unsigned int n; // Number of commands
+ HEVCPredCmd * cmds;
+} HEVCRpiIntraPredEnv;
+
+typedef struct HEVCRpiCoeffEnv {
+ unsigned int n;
+ uint16_t * buf;
+} HEVCRpiCoeffEnv;
+
+typedef struct HEVCRpiCoeffsEnv {
+ HEVCRpiCoeffEnv s[4];
+ GPU_MEM_PTR_T gptr;
+ void * mptr;
+} HEVCRpiCoeffsEnv;
+
+typedef struct HEVCRpiDeblkBlk {
+ uint16_t x_ctb;
+ uint16_t y_ctb;
+} HEVCRpiDeblkBlk;
+
+typedef struct HEVCRpiDeblkEnv {
+ unsigned int n;
+ HEVCRpiDeblkBlk * blks;
+} HEVCRpiDeblkEnv;
+
+typedef struct HEVCRPiFrameProgressWait {
+ int req;
+ struct HEVCRPiFrameProgressWait * next;
+ sem_t sem;
+} HEVCRPiFrameProgressWait;
+
+typedef struct HEVCRPiFrameProgressState {
+ struct HEVCRPiFrameProgressWait * first;
+ struct HEVCRPiFrameProgressWait * last;
+ pthread_mutex_t lock;
+} HEVCRPiFrameProgressState;
+
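+// A job is filled by the main thread in pass 0 (QPU inter-pred command lists,
+// intra/residual commands, coefficients, deblock block list) and then handed to
+// the worker thread, which performs the pass 1 pixel processing on it;
+// sem_in/sem_out below provide the hand-over in each direction.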
+typedef struct HEVCRpiJob {
+ volatile int terminate;
+ int pending;
+ sem_t sem_in; // set by main
+ sem_t sem_out; // set by worker
+ HEVCRpiInterPredEnv chroma_ip;
+ HEVCRpiInterPredEnv luma_ip;
+ int16_t progress[32]; // index by dpb_no
+ HEVCRpiIntraPredEnv intra;
+ HEVCRpiCoeffsEnv coeffs;
+ HEVCRpiDeblkEnv deblk;
+ HEVCRPiFrameProgressWait progress_wait;
+} HEVCRpiJob;
+
+#if RPI_TSTATS
+typedef struct HEVCRpiStats {
+ int y_pred1_y8_merge;
+ int y_pred1_xy;
+ int y_pred1_x0;
+ int y_pred1_y0;
+ int y_pred1_x0y0;
+ int y_pred1_wle8;
+ int y_pred1_wgt8;
+ int y_pred1_hle16;
+ int y_pred1_hgt16;
+ int y_pred2_xy;
+ int y_pred2_x0;
+ int y_pred2_y0;
+ int y_pred2_x0y0;
+ int y_pred2_hle16;
+ int y_pred2_hgt16;
+} HEVCRpiStats;
+#endif
+
+#endif
+
typedef struct HEVCContext {
const AVClass *c; // needed by private avoptions
AVCodecContext *avctx;
@@ -805,6 +1025,69 @@ typedef struct HEVCContext {
int width;
int height;
+ int used_for_ref; // rpi
+#ifdef RPI
+ int enable_rpi;
+ unsigned int pass0_job; // Pass0 does coefficient decode
+ unsigned int pass1_job; // Pass1 does pixel processing
+ int ctu_count; // Number of CTUs done in pass0 so far
+ int max_ctu_count; // Number of CTUs when we trigger a round of processing
+
+ HEVCRpiJob * jb0;
+ HEVCRpiJob * jb1;
+ HEVCRpiJob jobs[RPI_MAX_JOBS];
+#if RPI_TSTATS
+ HEVCRpiStats tstats;
+#endif
+#if RPI_INTER
+ struct qpu_mc_pred_y_p_s * last_y8_p;
+ struct qpu_mc_src_s * last_y8_l1;
+
+ // Function pointers
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ const uint8_t * qpu_dummy_frame_emu;
+#endif
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
+#endif
+ HEVCRpiQpu qpu;
+#endif
+
+ pthread_t worker_thread;
+
+#ifdef RPI_DEBLOCK_VPU
+#define RPI_DEBLOCK_VPU_Q_COUNT 2
+ int enable_rpi_deblock;
+
+ int uv_setup_width;
+ int uv_setup_height;
+ int setup_width; // Number of 16x16 blocks across the image
+ int setup_height; // Number of 16x16 blocks down the image
+
+ struct dblk_vpu_q_s
+ {
+ GPU_MEM_PTR_T deblock_vpu_gmem;
+
+ uint8_t (*y_setup_arm)[2][2][2][4];
+ uint8_t (*y_setup_vc)[2][2][2][4];
+
+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
+ uint8_t (*uv_setup_vc)[2][2][2][4];
+
+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+ int vpu_cmds_vc;
+
+ vpu_qpu_wait_h cmd_id;
+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+
+ struct dblk_vpu_q_s * dvq;
+ unsigned int dvq_n;
+
+#endif
+ HEVCLocalContextIntra HEVClcIntra;
+ HEVCRPiFrameProgressState progress_states[2];
+#endif
+
uint8_t *cabac_state;
/** 1 if the independent slice segment header was successfully parsed */
@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id,
uint8_t *buf, int buf_size);
+#if RPI_INTER
+extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n);
+#endif
+
/**
* Reset SEI values that are stored on the Context.
@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16];
extern const uint8_t ff_hevc_diag_scan8x8_x[64];
extern const uint8_t ff_hevc_diag_scan8x8_y[64];
+#ifdef RPI
+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
+
+// arm/hevc_misc_neon.S
+// Neon coeff zap fn
+#if HAVE_NEON
+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
+#endif
+
+void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int val, const int field);
+
+void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field);
+
+// All of these expect that s->threads_type == FF_THREAD_FRAME
+
+static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int y)
+{
+ if (s->enable_rpi)
+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
+ else
+ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0);
+}
+
+static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y)
+{
+ if (s->enable_rpi && s->used_for_ref)
+ ff_hevc_rpi_progress_signal_field(s, y, 1);
+}
+
+static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int y)
+{
+ if (s->enable_rpi)
+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
+ else
+ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0);
+}
+
+static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y)
+{
+ if (s->used_for_ref)
+ {
+ if (s->enable_rpi)
+ ff_hevc_rpi_progress_signal_field(s, y, 0);
+ else
+ ff_thread_report_progress(&s->ref->tf, y, 0);
+ }
+}
+
+static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s)
+{
+ if (s->enable_rpi)
+ {
+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
+ }
+ else
+ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+}
+
+#else
+
+// Use #define as that allows us to discard "jb" which won't exist in non-RPI world
+#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0)
+#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0)
+#define ff_hevc_progress_signal_mv(s, y)
+#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0)
+#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0)
+
+#endif
+
+// Set all done - signal nothing (used in missing refs)
+// Works for both rpi & non-rpi
+static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref)
+{
+ if (ref->tf.progress != NULL)
+ {
+ int * const p = (int *)&ref->tf.progress->data;
+ p[0] = INT_MAX;
+ p[1] = INT_MAX;
+ }
+}
+
#endif /* AVCODEC_HEVC_H */
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 05b2821840..c84886817d 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -21,14 +21,76 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define UNCHECKED_BITSTREAM_READER 1
+
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "cabac_functions.h"
#include "hevc.h"
+#ifdef RPI
+#include "libavutil/rpi_sand_fns.h"
+#endif
+
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV ARCH_X86
+
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
+
+#if ARCH_ARM
+#include "arm/hevc_cabac.h"
+#endif
+
#define CABAC_MAX_BIN 31
+
+#if USE_BY22 && !USE_BY22_DIV
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+
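+// Table of ~(2^40 / range) for range = 256..511, indexed by (range & 0xff), so
+// that a 32x32->64 bit multiply can approximate low/range in the peek code
+// below. Entry 0 (range == 256) is left as 0, presumably because the exact
+// value (2^32) doesn't fit in 32 bits; get_cabac_by22_peek special-cases it.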
+static const uint32_t cabac_by22_inv_range[256] = {
+ 0, I(257), I(258), I(259),
+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+ I(510), I(511)
+};
+#undef I
+#endif // USE_BY22
+
/**
* number of bin by SyntaxElement.
*/
@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
{ 28, 36, 43, 49, 54, 58, 61, 63, },
};
+
+typedef struct
+{
+ uint16_t coeff;
+ uint16_t scale;
+} xy_off_t;
+
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
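+// Each entry pairs a coefficient offset within the transform block with the
+// corresponding offset into the (at most 8x8) scaling matrix; SCALE_TRAFO /
+// SCALE_SHR clamp larger transforms down to the 8x8 grid. There is one table
+// per scan order (diag/horiz/vert) and per log2 transform size 2..5.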
+
+#define OFF_DIAG(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+#define OFF_HORIZ(t) {\
+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+}
+
+#define OFF_VERT(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+static const xy_off_t off_xys[3][4][16] =
+{
+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+};
+
+
+// Helper fns
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+#define hevc_clz32 hevc_clz32_builtin
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+{
+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+}
+#endif
+
+// It is unlikely that we will ever need this but include for completeness
+#ifndef hevc_clz32
+static inline unsigned int hevc_clz32(unsigned int x)
+{
+ unsigned int n = 1;
+ if ((x & 0xffff0000) == 0) {
+ n += 16;
+ x <<= 16;
+ }
+ if ((x & 0xff000000) == 0) {
+ n += 8;
+ x <<= 8;
+ }
+ if ((x & 0xf0000000) == 0) {
+ n += 4;
+ x <<= 4;
+ }
+ if ((x & 0xc0000000) == 0) {
+ n += 2;
+ x <<= 2;
+ }
+ return n - ((x >> 31) & 1);
+}
+#endif
+
+
+#if !USE_BY22
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+// will no longer be called but the setup calls will still exist and we want
+// to null them out
+#define bypass_start(s)
+#define bypass_finish(s)
+#else
+// Use BY22 for residual bypass block
+
+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+
+// BY22 observes that bypass decoding is simply a divide into the bitstream, so
+// we can peek out large quantities of bits at once and treat the result as if
+// it were VLC. In many cases this leads to O(1) processing rather than O(n),
+// though the setup and teardown are sufficiently expensive that it is only
+// worth using if we expect to be dealing with more than a few bits.
+// The definition of "a few bits" will vary from platform to platform, but
+// tests on ARM show that it probably isn't worth it for a single coded
+// residual, though it is for more than one. It also seems likely that if there
+// are more residuals then they are likely to be bigger, which makes the O(1)
+// nature of the code more worthwhile.
+
+
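+// Typical use, as in the *_bypass decoders below:
+//     y = get_cabac_by22_peek(c);     // look at up to CABAC_BY22_PEEK_BITS bits
+//     ... decode a prefix/suffix from the top bits of y ...
+//     get_cabac_by22_flush(c, n, y);  // consume the n bits actually used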
+#if !USE_BY22_DIV
+// * 1/x @ 32 bits gets us 22 bits of accuracy
+#define CABAC_BY22_PEEK_BITS 22
+#else
+// A real 32-bit divide gets us another bit
+// If we have a 64 bit int & a unit time divider then we should get a lot
+// of bits (55) but that is untested and it is unclear if it would give
+// us a large advantage
+#define CABAC_BY22_PEEK_BITS 23
+#endif
+
+// Bypass block start
+// Must be called before _by22_peek is used as it sets the CABAC environment
+// into the correct state. _by22_finish must be called to return to 'normal'
+// (i.e. non-bypass) cabac decoding
+static inline void get_cabac_by22_start(CABACContext * const c)
+{
+ const unsigned int bits = __builtin_ctz(c->low);
+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+#if !USE_BY22_DIV
+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+#endif
+
+ c->bytestream -= (CABAC_BITS / 8);
+ c->by22.bits = bits;
+#if !USE_BY22_DIV
+ c->by22.range = c->range;
+ c->range = inv;
+#endif
+ c->low = x;
+}
+
+// Bypass block finish
+// Must be called at the end of the bypass block to return to normal operation
+static inline void get_cabac_by22_finish(CABACContext * const c)
+{
+ unsigned int used = c->by22.bits;
+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+
+ c->bytestream += bytes_used + (CABAC_BITS / 8);
+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+#if !USE_BY22_DIV
+ c->range = c->by22.range;
+#endif
+}
+
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is called and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22 which should be long enough for any prefix or suffix
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+{
+#if USE_BY22_DIV
+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+#else
+ uint32_t x = c->low & ~1U;
+ const uint32_t inv = c->range;
+
+ if (inv != 0)
+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+ return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
+ // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+ // and refill lower bits
+ // We will probably OR over some existing bits but that doesn't matter
+ c->by22.bits += n;
+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif // USE_BY22
+
+
void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
{
if (s->ps.pps->entropy_coding_sync_enabled_flag &&
@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
}
-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
}
int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
}
-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
int log2_size, int *last_scx_prefix, int *last_scy_prefix)
{
int i = 0;
int max = (log2_size << 1) - 1;
int ctx_offset, ctx_shift;
- if (!c_idx) {
+ if (!c_idx_nz) {
ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
ctx_shift = (log2_size + 1) >> 2;
} else {
@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
return value;
}
-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
{
int inc;
- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+ inc = (ctx_cg != 0) + (c_idx_nz << 1);
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
}
-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
- int offset, const uint8_t *ctx_idx_map)
-{
- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-}
-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
{
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
}
@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
}
-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
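+// Bypass (BY22) form of coeff_abs_level_remaining: the peeked word starts with
+// a unary prefix of 1-bits (counted via clz of ~y); a prefix < 3 gives a Rice
+// code with rice_param suffix bits, while longer prefixes switch to the
+// exp-Golomb style form, mirroring the bit-at-a-time decoder further down.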
+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ unsigned int prefix;
+ unsigned int last_coeff_abs_level_remaining;
+ unsigned int n;
+
+ y = get_cabac_by22_peek(c);
+ prefix = hevc_clz32(~y);
+ // y << prefix will always have top bit 0
+
+ if (prefix < 3) {
+ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+ n = prefix + 1 + rice_param;
+ }
+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+ {
+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix * 2 + rice_param - 2;
+ }
+ else {
+ unsigned int suffix;
+
+ get_cabac_by22_flush(c, prefix, y);
+ y = get_cabac_by22_peek(c);
+
+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix + rice_param - 2;
+ }
+
+ get_cabac_by22_flush(c, n, y);
+
+ return last_coeff_abs_level_remaining;
+}
+#endif
+
+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+{
+ CABACContext * const c = &s->HEVClc->cc;
int prefix = 0;
int suffix = 0;
int last_coeff_abs_level_remaining;
int i;
- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
prefix++;
if (prefix == CABAC_MAX_BIN) {
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
return 0;
}
+
if (prefix < 3) {
for (i = 0; i < rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
} else {
int prefix_minus3 = prefix - 3;
for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
<< rc_rice_param) + suffix;
}
+
return last_coeff_abs_level_remaining;
}
-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
{
- int i;
- int ret = 0;
+ CABACContext * const c = &s->HEVClc->cc;
+ unsigned int i;
+ uint32_t ret = 0;
for (i = 0; i < nb; i++)
- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
- return ret;
+ ret = (ret << 1) | get_cabac_bypass(c);
+
+ return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ y = get_cabac_by22_peek(c);
+ get_cabac_by22_flush(c, nb, y);
+ return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i;
+ unsigned int rv = 0;
+ for (i = 0; i != n; ++i) {
+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+ const unsigned int b = get_cabac(c, state0 + idx);
+ rv = (rv << 1) | b;
+ }
+ return rv;
}
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
+// this version of events.
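+// The returned word packs one flag per coefficient, MSB first in the
+// order of significant_coeff_flag_idx[]; a set bit means that
+// coeff_abs_level_remaining is coded for that coefficient.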
+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+ int * const pprev_subset_coded, int * const psum,
+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+ unsigned int rv;
+ unsigned int i;
+ const unsigned int n = FFMIN(n_end, 8);
+
+ // Really this is i != n but the simple unconditional loop is cheaper
+ // and faster
+ for (i = 0; i != 8; ++i)
+ levels[i] = 1;
+
+ rv = get_cabac_greater1_bits(c, n, state0);
+
+ *pprev_subset_coded = 0;
+ *psum = n;
+
+ rv <<= (32 - n);
+ if (rv != 0)
+ {
+ *pprev_subset_coded = 1;
+ *psum = n + 1;
+ i = hevc_clz32(rv);
+ levels[i] = 2;
+ if (get_cabac(c, state_gt2) == 0)
+ {
+ // Unset first coded bit
+ rv &= ~(0x80000000U >> i);
+ }
+ }
+
+ if (n_end > 8) {
+ const unsigned int g8 = n_end - 8;
+ rv |= ((1 << g8) - 1) << (24 - g8);
+ for (i = 0; i != g8; ++i) {
+ levels[i + 8] = 0;
+ }
+ }
+
+ return rv;
+}
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+// or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift down to a minimum of 0, so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+// to achieve it
+
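+// Illustrative worked example: bit_depth = 8, log2_trafo_size = 4, qp = 26
+// gives rem6 = 2, div6 = 4, so scale = 51 and shift = (8 + 4 - 6) - 4 = 2.
+// For level = 3 with the default scale_m = 16:
+//   (((3 * 51 * 16) >> 2) + 1) >> 1 = 306
+// matching the usual spec formulation ((3 * 16 * 51 << 4) + (1 << 6)) >> 7 = 306
+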
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+}
+#endif
+
+
+#ifndef update_rice
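+// Persistent Rice adaptation statistics: the working Rice parameter is
+// later derived as *stat_coeff >> 2, so bumping the counter here moves
+// the parameter in quarter steps.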
+static inline void update_rice(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+ if (x >= 6)
+ (*stat_coeff)++;
+ else if (x == 0 && *stat_coeff > 0)
+ (*stat_coeff)--;
+}
+#endif
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t const * ctx_map,
+ uint8_t * p)
+{
+ do {
+ if (get_cabac(c, state0 + ctx_map[n]))
+ *p++ = n;
+ } while (--n != 0);
+ return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t const * ctx_map,
+ uint8_t * const flag_idx)
+{
+ int rv;
+
+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+ return rv;
+}
+
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x1, x2, x3,\
+ x4, x5, x6, x7,\
+ x8, x9, x10, x11,\
+ x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x8, x12,\
+ x1, x5, x9, x13,\
+ x2, x6, x10, x14,\
+ x3, x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x1, x8,\
+ x5, x2, x12, x9,\
+ x6, x3, x13, x10,\
+ x7, x14, x11, x15}
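+// The three macros above re-emit a 4x4 table given in raster order in
+// horizontal, vertical or diagonal scan order, so the context maps below
+// can be indexed directly by scan position n rather than by (x_c, y_c)
+// as the code they replace did.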
+
+
+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+ uint8_t * const significant_coeff_group_flag,
+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+ int * const pPrev_sig)
+{
+ while (--i >= 0) {
+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
+ const unsigned int x_cg = scan_x_cg[i];
+
+ // For the flag decode we only care about Z/NZ but
+ // we use the full Right * 2 + Down when calculating
+ // significant coeff flags so we obtain it here.
+ //
+ // The group flag array is one longer than it needs to
+ // be so we don't need to check for y_cg limits
+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
+
+ if (i == 0 ||
+ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+ {
+ gf_y[0] |= (1 << x_cg);
+ *pPrev_sig = prev_sig;
+ break;
+ }
+ }
+
+ return i;
+}
+
+#ifdef RPI
+static void rpi_add_residual(HEVCContext * const s,
+ const unsigned int log2_trafo_size, const unsigned int c_idx,
+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+{
+ const AVFrame * const frame = s->frame;
+ unsigned int stride = frame->linesize[c_idx];
+ unsigned int x = x0 >> s->ps.sps->hshift[c_idx];
+ unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
+ const int is_sliced = av_rpi_is_sand_frame(frame);
+ uint8_t * dst = !is_sliced ?
+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(frame, x, y) :
+ av_rpi_sand_frame_pos_c(frame, x, y);
+
+ if (s->enable_rpi) {
+ const unsigned int i = s->jb0->intra.n;
+ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1;
+
+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
+ pc->ta.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->ta.stride == stride);
+
+ pc->type = RPI_PRED_ADD_RESIDUAL_C;
+ }
+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
+ pc->dc.dst == dst)
+ {
+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->dc.stride == stride);
+
+ // Rewrite as add residual - must rewrite all fields as different union member
+ pc->type = RPI_PRED_ADD_RESIDUAL_V;
+ pc->c_idx = c_idx;
+ pc->ta.buf = coeffs;
+ pc->ta.dst = dst;
+ pc->ta.stride = stride;
+ pc->ta.dc = dc;
+ }
+ else
+ {
+ HEVCPredCmd * const cmd = pc + 1;
+ s->jb0->intra.n = i + 1;
+
+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->ta.buf = coeffs;
+ cmd->ta.dst = dst;
+ cmd->ta.stride = stride;
+ cmd->ta.dc = 0;
+ }
+ }
+ else if (!is_sliced || c_idx == 0) {
+ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+ }
+#if RPI_HEVC_SAND
+ // * These should probably never happen
+ else if (c_idx == 1) {
+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0);
+ }
+ else {
+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0);
+ }
+#endif
+}
+
+
+static void rpi_add_dc(HEVCContext * const s,
+ const unsigned int log2_trafo_size, const unsigned int c_idx,
+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+{
+ const AVFrame * const frame = s->frame;
+ const unsigned int stride = frame->linesize[c_idx];
+ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx];
+ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
+ const int is_sliced = av_rpi_is_sand_frame(frame);
+ uint8_t * const dst = !is_sliced ?
+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(frame, x, y) :
+ av_rpi_sand_frame_pos_c(frame, x, y);
+
+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
+
+ if (s->enable_rpi) {
+ const unsigned int i = s->jb0->intra.n;
+ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1;
+
+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
+ pc->ta.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->ta.stride == stride);
+
+ pc->ta.dc = (int16_t)coeff;
+ }
+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
+ pc->dc.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->dc.stride == stride &&
+ (pc->dc.dc & ~0xffff) == 0);
+
+ pc->dc.dc |= (coeff << 16);
+ }
+ else
+ {
+ HEVCPredCmd * const cmd = pc + 1;
+ s->jb0->intra.n = i + 1;
+
+ cmd->type = RPI_PRED_ADD_DC + c_idx;
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->dc.dst = dst;
+ cmd->dc.stride = stride;
+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
+ }
+ }
+}
+
+
+#endif
void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int log2_trafo_size, enum ScanType scan_idx,
int c_idx)
{
-#define GET_COORD(offset, n) \
- do { \
- x_c = (x_cg << 2) + scan_x_off[n]; \
- y_c = (y_cg << 2) + scan_y_off[n]; \
- } while (0)
- HEVCLocalContext *lc = s->HEVClc;
- int transform_skip_flag = 0;
+ HEVCLocalContext * const lc = s->HEVClc;
+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
int last_significant_coeff_x, last_significant_coeff_y;
- int last_scan_pos;
- int n_end;
int num_coeff = 0;
- int greater1_ctx = 1;
+ int prev_subset_coded = 0;
int num_last_subset;
int x_cg_last_sig, y_cg_last_sig;
- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+ const uint8_t *scan_x_cg, *scan_y_cg;
+ const xy_off_t * scan_xy_off;
+#ifndef RPI
ptrdiff_t stride = s->frame->linesize[c_idx];
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
((x0 >> hshift) << s->ps.sps->pixel_shift)];
- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
- uint8_t significant_coeff_group_flag[8][8] = {{0}};
+#endif
+#ifdef RPI
+ int use_vpu;
+ int use_dc = 0;
+#endif
+ int16_t *coeffs;
+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
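+ // (Each entry is a bitmask: bit x_cg of byte y_cg is set when that
+ // coefficient group is significant - replaces the old [8][8] byte array)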
int explicit_rdpcm_flag = 0;
int explicit_rdpcm_dir_flag;
int trafo_size = 1 << log2_trafo_size;
int i;
- int qp,shift,add,scale,scale_m;
+ int qp,shift,scale;
static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
const uint8_t *scale_matrix = NULL;
uint8_t dc_scale;
int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
lc->tu.intra_pred_mode_c;
- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+ int prev_sig = 0;
+ const int c_idx_nz = (c_idx != 0);
+
+ int may_hide_sign;
// Derive QP for dequant
if (!lc->cu.cu_transquant_bypass_flag) {
- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
static const uint8_t rem6[51 + 4 * 6 + 1] = {
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
};
int qp_y = lc->qp_y;
+ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
if (s->ps.pps->transform_skip_enabled_flag &&
log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+ if (transform_skip_flag) {
+ trans_skip_or_bypass = 1;
+ if (lc->cu.pred_mode == MODE_INTRA &&
+ s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+ may_hide_sign = 0;
+ }
+ }
}
if (c_idx == 0) {
@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
qp += s->ps.sps->qp_bd_offset;
}
- shift = s->ps.sps->bit_depth + log2_trafo_size - 5;
- add = 1 << (shift-1);
- scale = level_scale[rem6[qp]] << (div6[qp]);
- scale_m = 16; // default when no custom scaling lists.
- dc_scale = 16;
+ // Shift is set to one less than the value actually applied, as the
+ // scale-and-saturate step adds 1 and then shifts right by one more
+ shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+ scale = level_scale[rem6[qp]];
+ if (div6[qp] >= shift) {
+ scale <<= (div6[qp] - shift);
+ shift = 0;
+ } else {
+ shift -= div6[qp];
+ }
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
int matrix_id = lc->cu.pred_mode != MODE_INTRA;
matrix_id = 3 * matrix_id + c_idx;
scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+ dc_scale = scale_matrix[0];
if (log2_trafo_size >= 4)
dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
}
+ else
+ {
+ static const uint8_t sixteen_scale[64] = {
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ };
+ scale_matrix = sixteen_scale;
+ dc_scale = 16;
+ }
} else {
+ static const uint8_t unit_scale[64] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ };
+ scale_matrix = unit_scale;
shift = 0;
- add = 0;
- scale = 0;
- dc_scale = 0;
+ scale = 2; // We will shift right to kill this
+ dc_scale = 1;
+
+ may_hide_sign = 0;
}
+
+
+
if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+ trans_skip_or_bypass) {
+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
if (explicit_rdpcm_flag) {
- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+ may_hide_sign = 0;
+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
}
}
- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
&last_significant_coeff_x, &last_significant_coeff_y);
if (last_significant_coeff_x > 3) {
@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int last_x_c = last_significant_coeff_x & 3;
int last_y_c = last_significant_coeff_y & 3;
- scan_x_off = ff_hevc_diag_scan4x4_x;
- scan_y_off = ff_hevc_diag_scan4x4_y;
num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
- if (trafo_size == 4) {
+
+ switch (log2_trafo_size) {
+ case 2:
scan_x_cg = scan_1x1;
scan_y_cg = scan_1x1;
- } else if (trafo_size == 8) {
+ break;
+ case 3:
num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = diag_scan2x2_x;
scan_y_cg = diag_scan2x2_y;
- } else if (trafo_size == 16) {
+ break;
+ case 4:
num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan4x4_x;
scan_y_cg = ff_hevc_diag_scan4x4_y;
- } else { // trafo_size == 32
+ break;
+ case 5:
+ default:
num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan8x8_x;
scan_y_cg = ff_hevc_diag_scan8x8_y;
+ break;
}
break;
}
case SCAN_HORIZ:
scan_x_cg = horiz_scan2x2_x;
scan_y_cg = horiz_scan2x2_y;
- scan_x_off = horiz_scan4x4_x;
- scan_y_off = horiz_scan4x4_y;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
break;
default: //SCAN_VERT
scan_x_cg = horiz_scan2x2_y;
scan_y_cg = horiz_scan2x2_x;
- scan_x_off = horiz_scan4x4_y;
- scan_y_off = horiz_scan4x4_x;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
break;
}
num_coeff++;
num_last_subset = (num_coeff - 1) >> 4;
- for (i = num_last_subset; i >= 0; i--) {
- int n, m;
- int x_cg, y_cg, x_c, y_c, pos;
- int implicit_non_zero_coeff = 0;
- int64_t trans_coeff_level;
- int prev_sig = 0;
- int offset = i << 4;
- int rice_init = 0;
-
- uint8_t significant_coeff_flag_idx[16];
- uint8_t nb_significant_coeff_flag = 0;
+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
- x_cg = scan_x_cg[i];
- y_cg = scan_y_cg[i];
+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
- if ((i < num_last_subset) && (i > 0)) {
- int ctx_cg = 0;
- if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+ {
+ const unsigned int ccount = 1 << (log2_trafo_size * 2);
+#ifdef RPI
+ use_vpu = 0;
+ if (s->enable_rpi) {
+ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processing
+ use_dc = (num_coeff == 1) && !special &&
+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
- significant_coeff_group_flag[x_cg][y_cg] =
- significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
- implicit_non_zero_coeff = 1;
- } else {
- significant_coeff_group_flag[x_cg][y_cg] =
- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
- (x_cg == 0 && y_cg == 0));
+ if (use_dc) {
+ // Just need a little empty space
+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+ // No need to clear
+ }
+ else
+ {
+ use_vpu = !special && log2_trafo_size >= 4;
+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+#if HAVE_NEON
+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+#else
+ memset(coeffs, 0, ccount * sizeof(int16_t));
+#endif
+ }
}
+ else
+#endif
+ {
+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+ memset(coeffs, 0, ccount * sizeof(int16_t));
+ }
+ }
- last_scan_pos = num_coeff - offset - 1;
+ i = num_last_subset;
+ do {
+ int implicit_non_zero_coeff = 0;
+ int n_end;
+
+ uint8_t significant_coeff_flag_idx[16];
+ unsigned int nb_significant_coeff_flag = 0;
if (i == num_last_subset) {
+ // First time through
+ int last_scan_pos = num_coeff - (i << 4) - 1;
n_end = last_scan_pos - 1;
significant_coeff_flag_idx[0] = last_scan_pos;
nb_significant_coeff_flag = 1;
} else {
n_end = 15;
+ implicit_non_zero_coeff = (i != 0);
}
- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-
- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
- static const uint8_t ctx_idx_map[] = {
- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+ if (n_end >= 0) {
+ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+ };
+ // N.B. prev_sig = Right * 2 + Down
+ static const uint8_t ctx_idx_maps[3][4][16] = {
+ {
+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ }
};
const uint8_t *ctx_idx_map_p;
int scf_offset = 0;
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
- if (c_idx == 0) {
- scf_offset = 40;
- } else {
- scf_offset = 14 + 27;
- }
+
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ ctx_idx_map_p = ctx_idx_maps[0][3];
+ scf_offset = 40 + c_idx_nz;
} else {
- if (c_idx != 0)
+ if (c_idx_nz != 0)
scf_offset = 27;
+
if (log2_trafo_size == 2) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
} else {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
- if (c_idx == 0) {
- if ((x_cg > 0 || y_cg > 0))
+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+ if (!c_idx_nz) {
+ if (i != 0)
scf_offset += 3;
+
if (log2_trafo_size == 3) {
scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
} else {
@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
}
- for (n = n_end; n > 0; n--) {
- x_c = scan_x_off[n];
- y_c = scan_y_off[n];
- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
- significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
- nb_significant_coeff_flag++;
+
+ if (n_end > 0) {
+ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+ n_end, ctx_idx_map_p,
+ significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+ nb_significant_coeff_flag += cnt;
+ if (cnt != 0) {
implicit_non_zero_coeff = 0;
}
}
+
if (implicit_non_zero_coeff == 0) {
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- if (c_idx == 0) {
- scf_offset = 42;
- } else {
- scf_offset = 16 + 27;
- }
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ scf_offset = 42 + c_idx_nz;
} else {
if (i == 0) {
- if (c_idx == 0)
- scf_offset = 0;
- else
- scf_offset = 27;
+ scf_offset = c_idx_nz ? 27 : 0;
} else {
scf_offset = 2 + scf_offset;
}
}
- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
nb_significant_coeff_flag++;
}
@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
- n_end = nb_significant_coeff_flag;
-
+ if (nb_significant_coeff_flag != 0) {
+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ ((i != 0 && !c_idx_nz) ? 2 : 0) |
+ prev_subset_coded;
+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+ (gt1_idx_delta << 2);
+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+ gt1_idx_delta;
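+ // gt1_idx_delta is the spec's ctxSet: +2 for luma coefficient groups
+ // other than the top-left one, +1 if the previous subset coded a level
+ // greater than 1, and +4 selects the chroma context block; each set
+ // spans 4 greater1 contexts (hence the << 2) and one greater2 context.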
+
+ const unsigned int x_cg = scan_x_cg[i];
+ const unsigned int y_cg = scan_y_cg[i];
+ int16_t * const blk_coeffs = coeffs +
+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+ // This calculation is 'wrong' for log2_trafo_size == 2
+ // but that doesn't matter as in this case x_cg & y_cg
+ // are always 0 so the result is correct (0) anyway
+ const uint8_t * const blk_scale = scale_matrix +
+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+ // * The following code block doesn't deal with these flags:
+ // (nor did the one it replaces)
+ //
+ // cabac_bypass_alignment_enabled_flag
+ // This should be easy but I can't find a test case
+ // extended_precision_processing_flag
+ // This can extend the required precision past 16bits
+ // so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+ if (nb_significant_coeff_flag == 1) {
+ // There is a small gain to be had from special casing the single
+ // transform coefficient case. The reduction in complexity
+ // makes up for the code duplication.
+
+ int trans_coeff_level = 1;
+ int coeff_sign_flag;
+ int coded_val = 0;
+
+ // initialize first elem of coeff_abs_level_greater1_flag
+ prev_subset_coded = 0;
+
+ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+ trans_coeff_level = 2;
+ prev_subset_coded = 1;
+ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+ }
- if (n_end) {
- int first_nz_pos_in_cg;
- int last_nz_pos_in_cg;
- int c_rice_param = 0;
- int first_greater1_coeff_idx = -1;
- uint8_t coeff_abs_level_greater1_flag[8];
- uint16_t coeff_sign_flag;
- int sum_abs = 0;
- int sign_hidden;
- int sb_type;
+ // Probably not worth the overhead of starting by22 for just one value
+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+ if (coded_val)
+ {
+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+ } else {
+ uint8_t * const stat_coeff =
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ const unsigned int c_rice_param = *stat_coeff >> 2;
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
- // initialize first elem of coeff_bas_level_greater1_flag
- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ }
+ }
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
- sb_type = 2 * (c_idx == 0 ? 1 : 0);
- else
- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
- c_rice_param = lc->stat_coeff[sb_type] / 4;
- }
+ {
+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
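+ // k is 0 or -1: (level ^ k) - k below applies the sign without a branch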
+ const unsigned int scale_m = blk_scale[xy_off->scale];
- if (!(i == num_last_subset) && greater1_ctx == 0)
- ctx_set++;
- greater1_ctx = 1;
- last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
- int inc = (ctx_set << 2) + greater1_ctx;
- coeff_abs_level_greater1_flag[m] =
- coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
- if (coeff_abs_level_greater1_flag[m]) {
- greater1_ctx = 0;
- if (first_greater1_coeff_idx == -1)
- first_greater1_coeff_idx = m;
- } else if (greater1_ctx > 0 && greater1_ctx < 3) {
- greater1_ctx++;
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (trans_coeff_level ^ k) - k, // Apply sign
+ scale,
+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+ shift);
}
}
- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-
- if (lc->cu.cu_transquant_bypass_flag ||
- (lc->cu.pred_mode == MODE_INTRA &&
- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag &&
- (pred_mode_intra == 10 || pred_mode_intra == 26 )) ||
- explicit_rdpcm_flag)
- sign_hidden = 0;
else
- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+#endif
+ {
+ int sign_hidden = may_hide_sign;
+ int levels[16]; // Should be able to get away with int16_t but that fails some tests
+ uint32_t coeff_sign_flags;
+ uint32_t coded_vals = 0;
+ // Sum(abs(level[]))
+ // In fact we only need the bottom bit and in some future
+ // version that may be all we calculate
+ unsigned int sum_abs;
+
+ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+
+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+ sign_hidden = 0;
+
+ // -- Start bypass block
+
+ bypass_start(s);
+
+ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+
+ if (coded_vals != 0)
+ {
+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+ int * level = levels - 1;
+
+ do {
+ {
+ const unsigned int z = hevc_clz32(coded_vals) + 1;
+ level += z;
+ coded_vals <<= z;
+ }
- if (first_greater1_coeff_idx != -1) {
- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
- }
- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
- } else {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
- }
+ {
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
- for (m = 0; m < n_end; m++) {
- n = significant_coeff_flag_idx[m];
- GET_COORD(offset, n);
- if (m < 8) {
- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level += last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
+ sum_abs += last_coeff_abs_level_remaining + 1;
+ *level = trans_coeff_level;
+
+ if (stat_coeff != NULL)
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ stat_coeff = NULL;
+
+ if (trans_coeff_level > (3 << c_rice_param) &&
+ (c_rice_param < 4 || rice_adaptation_enabled))
+ ++c_rice_param;
}
- }
- } else {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level = 1 + last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
- }
+ } while (coded_vals != 0);
}
- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
- sum_abs += trans_coeff_level;
- if (n == first_nz_pos_in_cg && (sum_abs&1))
- trans_coeff_level = -trans_coeff_level;
+
+ // sign_hidden = 0 or 1 so we can combine the tests
+ if ((sign_hidden & sum_abs) != 0) {
+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
}
- if (coeff_sign_flag >> 15)
- trans_coeff_level = -trans_coeff_level;
- coeff_sign_flag <<= 1;
- if(!lc->cu.cu_transquant_bypass_flag) {
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
- if(y_c || x_c || log2_trafo_size < 4) {
- switch(log2_trafo_size) {
- case 3: pos = (y_c << 3) + x_c; break;
- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
- default: pos = (y_c << 2) + x_c; break;
- }
- scale_m = scale_matrix[pos];
- } else {
- scale_m = dc_scale;
- }
+
+ bypass_finish(s);
+
+ // -- Finish bypass block
+
+ // Scale loop
+ {
+ int m = nb_significant_coeff_flag - 1;
+
+ // Deal with DC component (if any) first
+ if (i == 0 && significant_coeff_flag_idx[m] == 0)
+ {
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+ blk_coeffs[0] = trans_scale_sat(
+ (levels[m] ^ k) - k, scale, dc_scale, shift);
+ --m;
}
- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
- if(trans_coeff_level < 0) {
- if((~trans_coeff_level) & 0xFffffffffff8000)
- trans_coeff_level = -32768;
- } else {
- if(trans_coeff_level & 0xffffffffffff8000)
- trans_coeff_level = 32767;
+
+#if !USE_N_END_1
+ // If N_END_1 set then m was at least 1 initially
+ if (m >= 0)
+#endif
+ {
+ do {
+ const xy_off_t * const xy_off = scan_xy_off +
+ significant_coeff_flag_idx[m];
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (levels[m] ^ k) - k,
+ scale,
+ blk_scale[xy_off->scale],
+ shift);
+ } while (--m >= 0);
}
}
- coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+
}
}
- }
+ } while ((i = next_subset(s, i, c_idx_nz,
+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
if (lc->cu.cu_transquant_bypass_flag) {
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else {
- if (transform_skip_flag) {
+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
log2_trafo_size == 2 &&
lc->cu.pred_mode == MODE_INTRA;
@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
} else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
s->hevcdsp.idct_4x4_luma(coeffs);
- } else {
+ }
+#ifdef RPI
+ else if (!use_vpu)
+#else
+ else
+#endif
+ {
int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
if (max_xy == 0)
- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+ {
+#ifdef RPI
+ if (use_dc)
+ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs);
+ else
+#endif
+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
+ }
else {
int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
if (max_xy < 4)
@@ -1510,7 +2279,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
}
+#ifdef RPI
+ if (!use_dc)
+ {
+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs);
+ }
+#else
s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+#endif
}
void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 9fbcd1d8b8..df129e2e46 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -22,6 +22,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_SAO
+//#define DISABLE_DEBLOCK
+//#define DISABLE_STRENGTHS
+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+//#define DISABLE_DEBLOCK_NONREF
+
#include "libavutil/common.h"
#include "libavutil/internal.h"
@@ -31,6 +37,16 @@
#include "bit_depth_template.c"
+#ifdef RPI
+#include "rpi_qpu.h"
+#endif
+#if RPI_HEVC_SAND
+#include "rpi_zc.h"
+#include "libavutil/rpi_sand_fns.h"
+#else
+#define RPI_ZC_SAND_8_IN_10_BUF 0
+#endif
+
#define LUMA 0
#define CB 1
#define CR 2
@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
}
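+// For sand (sliced) frames chroma is stored with U and V interleaved,
+// so a chroma sample step is twice the luma one - hence the extra shift
+// for c_idx != 0.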
+static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx)
+{
+#if RPI_HEVC_SAND
+ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
+#else
+ return s->ps.sps->pixel_shift;
+#endif
+}
+
static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
intptr_t stride_dst, intptr_t stride_src)
{
@@ -161,12 +186,21 @@ int i, j;
}
}
+// "DSP" these?
static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
{
- if (pixel_shift)
- *(uint16_t *)dst = *(uint16_t *)src;
- else
- *dst = *src;
+ switch (pixel_shift)
+ {
+ case 2:
+ *(uint32_t *)dst = *(uint32_t *)src;
+ break;
+ case 1:
+ *(uint16_t *)dst = *(uint16_t *)src;
+ break;
+ default:
+ *dst = *src;
+ break;
+ }
}
static void copy_vert(uint8_t *dst, const uint8_t *src,
@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src,
int stride_dst, int stride_src)
{
int i;
- if (pixel_shift == 0) {
- for (i = 0; i < height; i++) {
- *dst = *src;
- dst += stride_dst;
- src += stride_src;
- }
- } else {
- for (i = 0; i < height; i++) {
- *(uint16_t *)dst = *(uint16_t *)src;
- dst += stride_dst;
- src += stride_src;
- }
+ switch (pixel_shift)
+ {
+ case 2:
+ for (i = 0; i < height; i++) {
+ *(uint32_t *)dst = *(uint32_t *)src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ case 1:
+ for (i = 0; i < height; i++) {
+ *(uint16_t *)dst = *(uint16_t *)src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ default:
+ for (i = 0; i < height; i++) {
+ *dst = *src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
}
}
@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
int stride_src, int x, int y, int width, int height,
int c_idx, int x_ctb, int y_ctb)
{
- int sh = s->ps.sps->pixel_shift;
+ const unsigned int sh = pixel_shift(s, c_idx);
int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
@@ -224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s,
int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size);
int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
+ const unsigned int sh = pixel_shift(s, c_idx);
+ int len = (min_pu_size >> hshift) << sh;
for (y = y_min; y < y_max; y++) {
for (x = x_min; x < x_max; x++) {
if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
int n;
- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
+ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
for (n = 0; n < (min_pu_size >> vshift); n++) {
memcpy(src, dst, len);
src += stride_src;
@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s,
static void sao_filter_CTB(HEVCContext *s, int x, int y)
{
- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+#if SAO_FILTER_N == 5
+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+#elif SAO_FILTER_N == 6
+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+#else
+#error Confused by size of sao fn array
+#endif
HEVCLocalContext *lc = s->HEVClc;
int c_idx;
int edges[4]; // 0 left 1 top 2 right 3 bottom
@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
uint8_t right_tile_edge = 0;
uint8_t up_tile_edge = 0;
uint8_t bottom_tile_edge = 0;
+#if RPI_HEVC_SAND
+ const int sliced = av_rpi_is_sand_frame(s->frame);
+ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1);
+#else
+ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1);
+#endif
edges[0] = x_ctb == 0;
edges[1] = y_ctb == 0;
edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
+#ifdef DISABLE_SAO
+ return;
+#endif
+
if (restore) {
if (!edges[0]) {
left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
}
}
- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+ for (c_idx = 0; c_idx < plane_count; c_idx++) {
int x0 = x >> s->ps.sps->hshift[c_idx];
int y0 = y >> s->ps.sps->vshift[c_idx];
int stride_src = s->frame->linesize[c_idx];
@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0);
int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
- int stride_dst;
+ ptrdiff_t stride_dst;
uint8_t *dst;
+#if RPI_HEVC_SAND
+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+ uint8_t * const src = !sliced ?
+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0, y0);
+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
+ !sliced ? src - (1 << sh) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
+ !sliced ? src + (width << sh) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
+
+
+ if (sliced && c_idx > 1) {
+ break;
+ }
+#else
+ const unsigned int sh = s->ps.sps->pixel_shift;
+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh);
+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : src + (width << sh);
+#endif
+
switch (sao->type_idx[c_idx]) {
case SAO_BAND:
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb);
if (s->ps.pps->transquant_bypass_enable_flag ||
(s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
- dst = lc->edge_emu_buffer;
- stride_dst = 2*MAX_PB_SIZE;
- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
- sao->offset_val[c_idx], sao->band_position[c_idx],
- width, height);
- restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
- x, y, width, height, c_idx);
+ dst = lc->edge_emu_buffer;
+ stride_dst = 2*MAX_PB_SIZE;
+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src);
+#if RPI_HEVC_SAND
+ if (sliced && c_idx != 0)
+ {
+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
+ sao->offset_val[1], sao->band_position[1],
+ sao->offset_val[2], sao->band_position[2],
+ width, height);
+ }
+ else
+#endif
+ {
+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+ sao->offset_val[c_idx], sao->band_position[c_idx],
+ width, height);
+ }
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+ x, y, width, height, c_idx);
} else {
- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
- sao->offset_val[c_idx], sao->band_position[c_idx],
- width, height);
+#if RPI_HEVC_SAND
+ if (sliced && c_idx != 0)
+ {
+// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src);
+
+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
+ sao->offset_val[1], sao->band_position[1],
+ sao->offset_val[2], sao->band_position[2],
+ width, height);
+ }
+ else
+#endif
+ {
+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+ sao->offset_val[c_idx], sao->band_position[c_idx],
+ width, height);
+ }
}
sao->type_idx[c_idx] = SAO_APPLIED;
break;
@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
{
int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
- int left_edge = edges[0];
int top_edge = edges[1];
- int right_edge = edges[2];
int bottom_edge = edges[3];
- int sh = s->ps.sps->pixel_shift;
- int left_pixels, right_pixels;
stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
if (!top_edge) {
- int left = 1 - left_edge;
- int right = 1 - right_edge;
- const uint8_t *src1[2];
uint8_t *dst1;
- int src_idx, pos;
+ int src_idx;
+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
- dst1 = dst - stride_dst - (left << sh);
- src1[0] = src - stride_src - (left << sh);
- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
- pos = 0;
- if (left) {
+ dst1 = dst - stride_dst;
+
+ if (src_l != NULL) {
src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1, src1[src_idx], sh);
- pos += (1 << sh);
+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
}
+
src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
- if (right) {
- pos += width << sh;
+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
+
+ if (src_r != NULL) {
src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
}
}
if (!bottom_edge) {
- int left = 1 - left_edge;
- int right = 1 - right_edge;
- const uint8_t *src1[2];
- uint8_t *dst1;
- int src_idx, pos;
+ uint8_t * const dst1 = dst + height * stride_dst;
+ int src_idx;
+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
+ const unsigned int hoff = height * stride_src;
- dst1 = dst + height * stride_dst - (left << sh);
- src1[0] = src + height * stride_src - (left << sh);
- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
- pos = 0;
- if (left) {
+ if (src_l != NULL) {
src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1, src1[src_idx], sh);
- pos += (1 << sh);
+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
}
+
src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
- if (right) {
- pos += width << sh;
+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
+
+ if (src_r != NULL) {
src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
}
}
- left_pixels = 0;
- if (!left_edge) {
+ if (src_l != NULL) {
if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
copy_vert(dst - (1 << sh),
s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
sh, height, stride_dst, 1 << sh);
} else {
- left_pixels = 1;
+ copy_vert(dst - (1 << sh),
+ src_l,
+ sh, height, stride_dst, stride_src);
}
}
- right_pixels = 0;
- if (!right_edge) {
+ if (src_r != NULL) {
if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
copy_vert(dst + (width << sh),
s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
sh, height, stride_dst, 1 << sh);
} else {
- right_pixels = 1;
+ copy_vert(dst + (width << sh),
+ src_r,
+ sh, height, stride_dst, stride_src);
}
}
- copy_CTB(dst - (left_pixels << sh),
- src - (left_pixels << sh),
- (width + left_pixels + right_pixels) << sh,
+ copy_CTB(dst,
+ src,
+ width << sh,
height, stride_dst, stride_src);
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb);
- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
- sao->eo_class[c_idx], width, height);
- s->hevcdsp.sao_edge_restore[restore](src, dst,
- stride_src, stride_dst,
- sao,
- edges, width,
- height, c_idx,
- vert_edge,
- horiz_edge,
- diag_edge);
+#if RPI_HEVC_SAND
+ if (sliced && c_idx != 0)
+ {
+ // Class always the same for both U & V (which is just as well :-))
+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
+ width, height);
+ s->hevcdsp.sao_edge_restore_c[restore](src, dst,
+ stride_src, stride_dst,
+ sao,
+ edges, width,
+ height, c_idx,
+ vert_edge,
+ horiz_edge,
+ diag_edge);
+ }
+ else
+#endif
+ {
+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+ sao->eo_class[c_idx], width, height);
+ s->hevcdsp.sao_edge_restore[restore](src, dst,
+ stride_src, stride_dst,
+ sao,
+ edges, width,
+ height, c_idx,
+ vert_edge,
+ horiz_edge,
+ diag_edge);
+ }
+ // ??? Does this actually work for chroma ???
restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
x, y, width, height, c_idx);
sao->type_idx[c_idx] = SAO_APPLIED;
@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
}
}
}
+
+#if RPI_ZC_SAND_8_IN_10_BUF
+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
+ {
+ const unsigned int stride1 = s->frame->linesize[0];
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
+ const unsigned int xoff = (x >> 8) * stride2 * stride1;
+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
+ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
+
+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
+ }
+#endif
}
+// Returns 2 or 0.
static int get_pcm(HEVCContext *s, int x, int y)
{
int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
uint8_t *src;
int x, y;
int chroma, beta;
- int32_t c_tc[2], tc[2];
+ int32_t c_tc[4], tc[2];
uint8_t no_p[2] = { 0 };
uint8_t no_q[2] = { 0 };
@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->ps.sps->pcm.loop_filter_disable_flag) ||
s->ps.pps->transquant_bypass_enable_flag;
+#ifdef DISABLE_DEBLOCK_NONREF
+ if (!s->used_for_ref)
+ return; // Don't deblock non-reference frames
+#endif
+#ifdef DISABLE_DEBLOCK
+ return;
+#endif
+ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+ return;
if (x0) {
left_tc_offset = s->deblock[ctb - 1].tc_offset;
left_beta_offset = s->deblock[ctb - 1].beta_offset;
@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
tc[0] = bs0 ? TC_CALC(qp, bs0) : 0;
tc[1] = bs1 ? TC_CALC(qp, bs1) : 0;
- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x - 1, y);
no_p[1] = get_pcm(s, x - 1, y + 4);
no_q[0] = get_pcm(s, x, y);
no_q[1] = get_pcm(s, x, y + 4);
- s->hevcdsp.hevc_v_loop_filter_luma_c(src,
- s->frame->linesize[LUMA],
- beta, tc, no_p, no_q);
- } else
- s->hevcdsp.hevc_v_loop_filter_luma(src,
- s->frame->linesize[LUMA],
- beta, tc, no_p, no_q);
+ }
+#if RPI_HEVC_SAND
+ if (av_rpi_is_sand_frame(s->frame)) {
+
+ // This copes properly with no_p/no_q
+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ s->frame->linesize[LUMA],
+ beta, tc, no_p, no_q,
+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
+ }
+ else
+#endif
+ {
+ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+ if (pcmf) {
+ // Standard DSP code is broken if no_p / no_q is set
+ s->hevcdsp.hevc_v_loop_filter_luma_c(src,
+ s->frame->linesize[LUMA],
+ beta, tc, no_p, no_q);
+ }
+ else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((y>>3) & 1) << 1;
+ int b = (x>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[0][b][0][a] = beta;
+ setup[0][b][0][a + 1] = beta;
+ setup[0][b][1][a] = tc[0];
+ setup[0][b][1][a + 1] = tc[1];
+ } else
+#endif
+ {
+ s->hevcdsp.hevc_v_loop_filter_luma(src,
+ s->frame->linesize[LUMA],
+ beta, tc, no_p, no_q);
+ }
+ }
}
}
@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
tc[0] = bs0 ? TC_CALC(qp, bs0) : 0;
tc[1] = bs1 ? TC_CALC(qp, bs1) : 0;
- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+ src =
+#if RPI_HEVC_SAND
+ av_rpi_is_sand_frame(s->frame) ?
+ av_rpi_sand_frame_pos_y(s->frame, x, y) :
+#endif
+ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x, y - 1);
no_p[1] = get_pcm(s, x + 4, y - 1);
@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((x>>3) & 1) << 1;
+ int b = (y>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[1][b][0][a] = beta;
+ setup[1][b][0][a + 1] = beta;
+ setup[1][b][1][a] = tc[0];
+ setup[1][b][1][a + 1] = tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
}
if (s->ps.sps->chroma_format_idc) {
+#if RPI_HEVC_SAND
+ if (av_rpi_is_sand_frame(s->frame)) {
+ const int v = 2;
+ const int h = 2;
+
+ // vertical filtering chroma
+ for (y = y0; y < y_end; y += 8 * v) {
+// const int demi_y = y + 4 * v >= s->ps.sps->height;
+ const int demi_y = 0;
+ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) {
+ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2];
+ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2];
+
+ if ((bs0 == 2) || (bs1 == 2)) {
+ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
+ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1;
+ unsigned int no_f = !demi_y ? 0 : 2 | 8;
+
+ // tc_offset here should probably be set to cur_tc_offset
+ const uint32_t tc4 =
+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) |
+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+
+ if (tc4 == 0)
+ continue;
+
+ if (pcmf) {
+ no_f =
+ (get_pcm(s, x - 1, y) ? 1 : 0) |
+ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) |
+ (get_pcm(s, x, y) ? 4 : 0) |
+ (get_pcm(s, x, y + 4 * v) ? 8 : 0);
+ if (no_f == 0xf)
+ continue;
+ }
+
+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ s->frame->linesize[1],
+ tc4,
+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+ no_f);
+ }
+ }
+
+ if (y == 0)
+ continue;
+
+ // horizontal filtering chroma
+ tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+ x_end2 = x_end;
+ if (x_end != s->ps.sps->width)
+ x_end2 = x_end - 8 * h;
+
+ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) {
+// const int demi_x = x + 4 * v >= s->ps.sps->width;
+ const int demi_x = 0;
+
+ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2];
+ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+ if ((bs0 == 2) || (bs1 == 2)) {
+ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0;
+ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0;
+ const uint32_t tc4 =
+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) |
+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+ unsigned int no_f = !demi_x ? 0 : 2 | 8;
+
+ if (tc4 == 0)
+ continue;
+
+ if (pcmf) {
+ no_f =
+ (get_pcm(s, x, y - 1) ? 1 : 0) |
+ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) |
+ (get_pcm(s, x, y) ? 4 : 0) |
+ (get_pcm(s, x + 4 * h, y) ? 8 : 0);
+
+ if (no_f == 0xf)
+ continue;
+ }
+
+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ s->frame->linesize[1],
+ tc4, no_f);
+ }
+ }
+ }
+ }
+ else
+#endif
for (chroma = 1; chroma <= 2; chroma++) {
int h = 1 << s->ps.sps->hshift[chroma];
int v = 1 << s->ps.sps->vshift[chroma];
@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+ src =
+#if RPI_HEVC_SAND
+ av_rpi_is_sand_frame(s->frame) ?
+ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+#endif
+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x - 1, y);
no_p[1] = get_pcm(s, x - 1, y + (4 * v));
@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((yc>>3) & 1) << 1;
+ int b = (xc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[0][b][0][a] = c_tc[0];
+ setup[0][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_v_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
+
}
}
@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+ src =
+#if RPI_HEVC_SAND
+ av_rpi_is_sand_frame(s->frame) ?
+ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+#endif
+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x, y - 1);
no_p[1] = get_pcm(s, x + (4 * h), y - 1);
@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((xc>>3) & 1) << 1;
+ int b = (yc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[1][b][0][a] = c_tc[0];
+ setup[1][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
}
}
-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
- RefPicList *neigh_refPicList)
-{
- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
- // same L0 and L1
- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] &&
- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else {
- return 1;
- }
- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
- Mv A, B;
- int ref_A, ref_B;
-
- if (curr->pred_flag & 1) {
- A = curr->mv[0];
- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
- } else {
- A = curr->mv[1];
- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
- }
-
- if (neigh->pred_flag & 1) {
- B = neigh->mv[0];
- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
- } else {
- B = neigh->mv[1];
- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
- }
-
- if (ref_A == ref_B) {
- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
- return 1;
- else
- return 0;
- } else
- return 1;
- }
-
- return 1;
-}
void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_trafo_size)
@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
int min_pu_width = s->ps.sps->min_pu_width;
int min_tu_width = s->ps.sps->min_tb_width;
- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
int boundary_upper, boundary_left;
- int i, j, bs;
+ int i, j;
+ RefPicList *rpl = s->ref->refPicList;
+ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size);
+ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup
+ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep
+ int y_pu = y0 >> log2_min_pu_size;
+ int x_pu = x0 >> log2_min_pu_size;
+ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ int is_intra = curr->pred_flag == PF_INTRA;
+ int inc = log2_min_pu_size == 2 ? 2 : 1;
+ uint8_t *bs;
+
+#ifdef DISABLE_STRENGTHS
+ return;
+#endif
boundary_upper = y0 > 0 && !(y0 & 7);
if (boundary_upper &&
@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_upper = 0;
+ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_upper) {
RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
- s->ref->refPicList;
- int yp_pu = (y0 - 1) >> log2_min_pu_size;
- int yq_pu = y0 >> log2_min_pu_size;
- int yp_tu = (y0 - 1) >> log2_min_tu_size;
- int yq_tu = y0 >> log2_min_tu_size;
+ rpl;
+ MvField *top = curr - min_pu_width;
+
+ if (is_intra) {
+ for (i = 0; i < (1 << log2_trafo_size); i += 4)
+ bs[i >> 2] = 2;
+
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+ curr, top, bs);
for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- int x_tu = (x0 + i) >> log2_min_tu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || top_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, top, rpl_top);
- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+ int i_pu = i >> log2_min_pu_size;
+ int i_tu = i >> log2_min_tu_size;
+
+ if (top[i_pu].pred_flag == PF_INTRA)
+ bs[i >> 2] = 2;
+ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+ bs[i >> 2] = 1;
}
+ }
+ }
+
+ if (!is_intra) {
+ for (j = inc; j < trafo_in_min_pus; j += inc) {
+ MvField *top;
+
+ curr += min_pu_width * inc;
+ top = curr - min_pu_width;
+ bs += s->bs_width * inc << log2_min_pu_size >> 2;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, top, bs);
+ }
}
- // bs for vertical TU boundaries
boundary_left = x0 > 0 && !(x0 & 7);
if (boundary_left &&
((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_left = 0;
+ curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_left) {
RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
- s->ref->refPicList;
- int xp_pu = (x0 - 1) >> log2_min_pu_size;
- int xq_pu = x0 >> log2_min_pu_size;
- int xp_tu = (x0 - 1) >> log2_min_tu_size;
- int xq_tu = x0 >> log2_min_tu_size;
-
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int y_pu = (y0 + i) >> log2_min_pu_size;
- int y_tu = (y0 + i) >> log2_min_tu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-
- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || left_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, left, rpl_left);
- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
- }
- }
-
- if (log2_trafo_size > log2_min_pu_size && !is_intra) {
- RefPicList *rpl = s->ref->refPicList;
+ rpl;
+ MvField *left = curr - 1;
- // bs for TU internal horizontal PU boundaries
- for (j = 8; j < (1 << log2_trafo_size); j += 8) {
- int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
- int yq_pu = (y0 + j) >> log2_min_pu_size;
+ if (is_intra) {
+ for (j = 0; j < (1 << log2_trafo_size); j += 4)
+ bs[j * s->bs_width >> 2] = 2;
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-
- bs = boundary_strength(s, curr, top, rpl);
- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+ curr, left, bs);
+
+ for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+ int j_pu = j >> log2_min_pu_size;
+ int j_tu = j >> log2_min_tu_size;
+
+ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+ bs[j * s->bs_width >> 2] = 2;
+ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+ bs[j * s->bs_width >> 2] = 1;
}
}
+ }
- // bs for TU internal vertical PU boundaries
- for (j = 0; j < (1 << log2_trafo_size); j += 4) {
- int y_pu = (y0 + j) >> log2_min_pu_size;
+ if (!is_intra) {
+ for (i = inc; i < trafo_in_min_pus; i += inc) {
+ MvField *left;
- for (i = 8; i < (1 << log2_trafo_size); i += 8) {
- int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
- int xq_pu = (x0 + i) >> log2_min_pu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+ curr += inc;
+ left = curr - 1;
+ bs += inc << log2_min_pu_size >> 2;
- bs = boundary_strength(s, curr, left, rpl);
- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
- }
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, left, bs);
}
}
}
@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
#undef CB
#undef CR
+#ifdef RPI_DEBLOCK_VPU
+// ff_hevc_flush_buffer_lines
+// flushes and invalidates all pixel rows in [start,end-1]
+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+{
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma);
+ rpi_cache_flush_finish(rfe);
+}
+#endif
+
+#if RPI_INTER
+
+// Flush some lines of a reference frame
+void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
+{
+ if (s->enable_rpi && s->used_for_ref) {
+ const int d0 = ((int *)f->progress->data)[0];
+ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1
+
+ if (curr_y < (unsigned int)s->ps.sps->height) {
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y,
+ s->ps.sps->vshift[1], 1, 1);
+ rpi_cache_flush_finish(rfe);
+ }
+ }
+}
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+{
+ // Flush the image from 4 lines above y down to the bottom of the ctb stripe
+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+ // TODO flush buffer of beta/tc setup when it becomes cached
+
+ // Prepare three commands at once to avoid calling overhead
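+ // Each command is six words: destination VC address, stride, setup-array
+ // width, VC address of the beta/tc setup data for this stripe, the number
+ // of 16-high units to process, and a final selector (2, 3 and 4 here for
+ // the Y, U and V planes respectively)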
+ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+ s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+ s->dvq->vpu_cmds_arm[0][5] = 2;
+
+ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[1][5] = 3;
+
+ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+ s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[2][5] = 4;
+
+ // Call VPU
+ {
+ const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands
+ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
+ vpu_qpu_job_finish(vqj);
+ }
+
+ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+ s->dvq = s->dvq_ents + s->dvq_n;
+
+ vpu_qpu_wait(&s->dvq->cmd_id);
+}
+
+#endif
+
void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
{
int x_end = x >= s->ps.sps->width - ctb_size;
+#ifdef RPI_DEBLOCK_VPU
+ int done_deblock = 0;
+#endif
if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
deblocking_filter_CTB(s, x, y);
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock && x_end)
+ {
+ int y_at_end = y >= s->ps.sps->height - ctb_size;
+ int height = 64; // Deblock in units 64 high to avoid too many VPU calls
+ int y_start = y&~63;
+ if (y_at_end) height = s->ps.sps->height - y_start;
+ if ((((y+ctb_size)&63)==0) || y_at_end) {
+ done_deblock = 1;
+ rpi_deblock(s, y_start, height);
+ }
+ }
+#endif
if (s->ps.sps->sao_enabled) {
int y_end = y >= s->ps.sps->height - ctb_size;
if (y && x)
@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
sao_filter_CTB(s, x - ctb_size, y);
if (y && x_end) {
sao_filter_CTB(s, x, y - ctb_size);
- if (s->threads_type & FF_THREAD_FRAME )
- ff_thread_report_progress(&s->ref->tf, y, 0);
+ if (s->threads_type == FF_THREAD_FRAME ) {
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
+#endif
+ ff_hevc_progress_signal_recon(s, y);
+ }
}
if (x_end && y_end) {
sao_filter_CTB(s, x , y);
- if (s->threads_type & FF_THREAD_FRAME )
- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+ if (s->threads_type == FF_THREAD_FRAME ) {
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
+#endif
+ ff_hevc_progress_signal_recon(s, y + ctb_size);
+ }
}
- } else if (s->threads_type & FF_THREAD_FRAME && x_end)
- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+ } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
+ //int newh = y + ctb_size - 4;
+ //int currh = s->ref->tf.progress->data[0];
+ //if (((y + ctb_size)&63)==0)
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+ if (done_deblock) {
+ ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+ }
+ } else {
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+ ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+ }
+#else
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+ ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+#endif
+ }
}
void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
index 4a6dde0f67..8ee37ebfbc 100644
--- a/libavcodec/hevc_mvs.c
+++ b/libavcodec/hevc_mvs.c
@@ -111,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField
return 0;
}
-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
+static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb)
{
int tx, scale_factor;
@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
(scale_factor * src->y < 0)) >> 8);
}
-static int check_mvset(Mv *mvLXCol, Mv *mvCol,
- int colPic, int poc,
- RefPicList *refPicList, int X, int refIdxLx,
- RefPicList *refPicList_col, int listCol, int refidxCol)
+static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol,
+ const int colPic, const int poc,
+ const RefPicList * const refPicList, const int X, const int refIdxLx,
+ const RefPicList * const refPicList_col, const int listCol, const int refidxCol)
{
int cur_lt = refPicList[X].isLongTerm[refIdxLx];
int col_lt = refPicList_col[listCol].isLongTerm[refidxCol];
@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol,
refPicList_col, L ## l, temp_col.ref_idx[l])
// derive the motion vectors section 8.5.3.1.8
-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
- int refIdxLx, Mv *mvLXCol, int X,
- int colPic, RefPicList *refPicList_col)
+static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col,
+ const int refIdxLx, Mv * const mvLXCol, const int X,
+ const int colPic, const RefPicList * const refPicList_col)
{
- RefPicList *refPicList = s->ref->refPicList;
+ const RefPicList * const refPicList = s->ref->refPicList;
if (temp_col.pred_flag == PF_INTRA)
return 0;
@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
/*
* 8.5.3.1.7 temporal luma motion vector prediction
*/
-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH, int refIdxLx,
- Mv *mvLXCol, int X)
+static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH, const int refIdxLx,
+ Mv * const mvLXCol, const int X)
{
MvField *tab_mvf;
MvField temp_col;
int x, y, x_pu, y_pu;
- int min_pu_width = s->ps.sps->min_pu_width;
+ const int min_pu_width = s->ps.sps->min_pu_width;
int availableFlagLXCol = 0;
int colPic;
- HEVCFrame *ref = s->ref->collocated_ref;
+ HEVCFrame * const ref = s->ref->collocated_ref;
- if (!ref) {
+ if (ref == NULL || ref->tab_mvf == NULL) {
memset(mvLXCol, 0, sizeof(*mvLXCol));
return 0;
}
@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
x = x0 + nPbW;
y = y0 + nPbH;
- if (tab_mvf &&
- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
y < s->ps.sps->height &&
x < s->ps.sps->width) {
x &= ~15;
y &= ~15;
if (s->threads_type == FF_THREAD_FRAME)
- ff_thread_await_progress(&ref->tf, y, 0);
+ ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
x_pu = x >> s->ps.sps->log2_min_pu_size;
y_pu = y >> s->ps.sps->log2_min_pu_size;
temp_col = TAB_MVF(x_pu, y_pu);
@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
}
// derive center collocated motion vector
- if (tab_mvf && !availableFlagLXCol) {
+ if (!availableFlagLXCol) {
x = x0 + (nPbW >> 1);
y = y0 + (nPbH >> 1);
x &= ~15;
y &= ~15;
if (s->threads_type == FF_THREAD_FRAME)
- ff_thread_await_progress(&ref->tf, y, 0);
+ ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
x_pu = x >> s->ps.sps->log2_min_pu_size;
y_pu = y >> s->ps.sps->log2_min_pu_size;
temp_col = TAB_MVF(x_pu, y_pu);
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index c1b69a0199..455cdaea1c 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
switch (sps->bit_depth) {
case 8:
if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+#if RPI_HEVC_SAND
+ // *** Horrid kludge so that we start out with sand format
+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
+#else
if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+#endif
if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
break;
@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
break;
case 10:
if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+#if RPI_HEVC_SAND
+ // *** Horrid kludge so that we start out with sand format
+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10;
+#else
if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
+#endif
if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
break;
@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
if (sps_extension_flag[0]) {
int extended_precision_processing_flag;
- int high_precision_offsets_enabled_flag;
int cabac_bypass_alignment_enabled_flag;
sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
"extended_precision_processing_flag not yet implemented\n");
sps->intra_smoothing_disabled_flag = get_bits1(gb);
- high_precision_offsets_enabled_flag = get_bits1(gb);
- if (high_precision_offsets_enabled_flag)
+ sps->high_precision_offsets_enabled_flag = get_bits1(gb);
+ if (sps->high_precision_offsets_enabled_flag)
av_log(avctx, AV_LOG_WARNING,
- "high_precision_offsets_enabled_flag not yet implemented\n");
+ "high_precision_offsets_enabled_flag not fully implemented\n");
sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index df52e401ad..992e994b1a 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -23,7 +23,7 @@
#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
-
+#include "libavutil/rpi_sand_fns.h"
#include "internal.h"
#include "thread.h"
#include "hevc.h"
@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
HEVCFrame *frame = &s->DPB[min_idx];
AVFrame *dst = out;
AVFrame *src = frame->frame;
- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+ const int fmt = src->format;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
int pixel_shift = !!(desc->comp[0].depth > 8);
ret = av_frame_ref(out, src);
@@ -215,13 +216,31 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
if (ret < 0)
return ret;
-
- for (i = 0; i < 3; i++) {
- int hshift = (i > 0) ? desc->log2_chroma_w : 0;
- int vshift = (i > 0) ? desc->log2_chroma_h : 0;
- int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
- (frame->window.top_offset >> vshift) * dst->linesize[i];
- dst->data[i] += off;
+#ifdef RPI
+ if (av_rpi_is_sand_format(fmt))
+ {
+ // Sand cannot be windowed by offset so add side data if we have an offset
+ const HEVCWindow * const window = &frame->window;
+ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0)
+ {
+ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan));
+ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+ si->left_offset = window->left_offset;
+ si->top_offset = window->top_offset;
+ si->pic_width = s->ps.sps->width;
+ si->pic_height = s->ps.sps->height;
+ }
+ }
+ else
+#endif
+ {
+ for (i = 0; i < 3; i++) {
+ int hshift = (i > 0) ? desc->log2_chroma_w : 0;
+ int vshift = (i > 0) ? desc->log2_chroma_h : 0;
+ int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
+ (frame->window.top_offset >> vshift) * dst->linesize[i];
+ dst->data[i] += off;
+ }
}
av_log(s->avctx, AV_LOG_DEBUG,
"Output frame with POC %d.\n", frame->poc);
@@ -426,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
frame->sequence = s->seq_decode;
frame->flags = 0;
- if (s->threads_type == FF_THREAD_FRAME)
- ff_thread_report_progress(&frame->tf, INT_MAX, 0);
+ ff_hevc_progress_set_all_done(frame);
return frame;
}
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d960e..c9661c3ab1 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
#include "hevcdsp_template.c"
#undef BIT_DEPTH
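+// Computes motion-based deblocking boundary strengths (0 or 1) for `pus`
+// consecutive minimal PUs. curr/neigh advance by in_inc bytes per PU and each
+// computed strength is written `dup` times into bs, out_inc elements apart.
+// The intra (bs = 2) and cbf (bs = 1) cases are patched up by the caller in
+// hevc_filter.c.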
+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs)
+{
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+
+#if 1 // This more directly matches the original implementation
+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+ // same L0 and L1
+ if (curr_refL0 == neigh_refL0 &&
+ curr_refL0 == curr_refL1 &&
+ neigh_refL0 == neigh_refL1) {
+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL0 == curr_refL0 &&
+ neigh_refL1 == curr_refL1) {
+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL1 == curr_refL0 &&
+ neigh_refL0 == curr_refL1) {
+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else {
+ strength = 1;
+ }
+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+ Mv curr_mv0, neigh_mv0;
+
+ if (curr->pred_flag & 1) {
+ curr_mv0 = curr->mv[0];
+ } else {
+ curr_mv0 = curr->mv[1];
+ curr_refL0 = curr_refL1;
+ }
+
+ if (neigh->pred_flag & 1) {
+ neigh_mv0 = neigh->mv[0];
+ } else {
+ neigh_mv0 = neigh->mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ if (curr_refL0 == neigh_refL0) {
+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else
+ strength = 1;
+ } else
+ strength = 1;
+#else // This has exactly the same effect, but is more suitable for vectorisation
+ Mv curr_mv[2];
+ Mv neigh_mv[2];
+ memcpy(curr_mv, curr->mv, sizeof curr_mv);
+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+
+ if (!(curr->pred_flag & 2)) {
+ curr_mv[1] = curr_mv[0];
+ curr_refL1 = curr_refL0;
+ }
+ if (!(neigh->pred_flag & 2)) {
+ neigh_mv[1] = neigh_mv[0];
+ neigh_refL1 = neigh_refL0;
+ }
+ if (!(curr->pred_flag & 1)) {
+ curr_mv[0] = curr_mv[1];
+ curr_refL0 = curr_refL1;
+ }
+ if (!(neigh->pred_flag & 1)) {
+ neigh_mv[0] = neigh_mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ strength = 1;
+
+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+
+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+
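+ // Force strength to 1 when exactly one of the two PUs is bi-predicted:
+ // pred_flag + 1 lies in 2..4, so the XOR only has bit 2 set when one side
+ // (and only one) is PF_BI.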
+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+#endif
+
+ curr += in_inc / sizeof (MvField);
+ neigh += in_inc / sizeof (MvField);
+
+ for (out = dup; out > 0; out--)
+ {
+ *bs = strength;
+ bs += out_inc;
+ }
+ }
+}
+
void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
{
#undef FUNC
@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
+#if !RPI_HEVC_SAND
+#define SLICED_LOOP_FILTERS(depth)
+#define SLICED_ADD_RESIDUAL(depth)
+#define SLICED_SAO(depth)
+#else
+#define SLICED_ADD_RESIDUAL(depth)\
+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
+#define SLICED_LOOP_FILTERS(depth)\
+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
+#define SLICED_SAO(depth)\
+ for (i = 0; i != SAO_FILTER_N; ++i) { \
+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
+ } \
+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
+
+#endif
+
#define HEVC_DSP(depth) \
hevcdsp->put_pcm = FUNC(put_pcm, depth); \
- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \
- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \
- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \
- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \
- hevcdsp->transform_skip = FUNC(transform_skip, depth); \
+ hevcdsp->transform_add[0] = FUNC(add_residual4x4, depth); \
+ hevcdsp->transform_add[1] = FUNC(add_residual8x8, depth); \
+ hevcdsp->transform_add[2] = FUNC(add_residual16x16, depth); \
+ hevcdsp->transform_add[3] = FUNC(add_residual32x32, depth); \
+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
+ SLICED_ADD_RESIDUAL(depth); \
hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \
+ hevcdsp->transform_skip = FUNC(transform_skip, depth); \
+ hevcdsp->idct_4x4_luma = FUNC(idct_4x4_luma, depth); \
hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
\
- hevcdsp->sao_band_filter[0] = \
- hevcdsp->sao_band_filter[1] = \
- hevcdsp->sao_band_filter[2] = \
- hevcdsp->sao_band_filter[3] = \
- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \
- hevcdsp->sao_edge_filter[0] = \
- hevcdsp->sao_edge_filter[1] = \
- hevcdsp->sao_edge_filter[2] = \
- hevcdsp->sao_edge_filter[3] = \
- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \
+ for (i = 0; i != SAO_FILTER_N; ++i) { \
+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
+ } \
hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
+ SLICED_SAO(depth); \
\
QPEL_FUNCS(depth); \
QPEL_UNI_FUNCS(depth); \
@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
EPEL_UNI_FUNCS(depth); \
EPEL_BI_FUNCS(depth); \
\
+ SLICED_LOOP_FILTERS(depth); \
hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
@@ -257,6 +409,8 @@ int i = 0;
break;
}
+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+
if (ARCH_X86)
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
if (ARCH_ARM)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd59f..c4a1b0f09d 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -25,6 +25,7 @@
#ifndef AVCODEC_HEVCDSP_H
#define AVCODEC_HEVCDSP_H
+#include "rpi_opts.h"
#include "get_bits.h"
#define MAX_PB_SIZE 64
@@ -42,11 +43,40 @@ typedef struct SAOParams {
uint8_t type_idx[3]; ///< sao_type_idx
} SAOParams;
+typedef struct Mv {
+ int16_t x; ///< horizontal component of motion vector
+ int16_t y; ///< vertical component of motion vector
+} Mv;
+
+typedef struct MvField {
+ DECLARE_ALIGNED(4, Mv, mv)[2];
+ int8_t ref_idx[2];
+ int8_t pred_flag;
+} MvField;
+
+#ifdef RPI
+#define SAO_FILTER_N 6
+#else
+#define SAO_FILTER_N 5
+#endif
+
+
typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
struct GetBitContext *gb, int pcm_bit_depth);
- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+ // add_residual was transform_add - renamed to match the FFmpeg 3.3 names
+ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
+#if RPI_HEVC_SAND
+ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
+ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
+
+ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+ struct GetBitContext *gb, int pcm_bit_depth);
+#endif
void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext {
void (*idct_dc[4])(int16_t *coeffs);
- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
int16_t *sao_offset_val, int sao_left_class, int width, int height);
+#if RPI_HEVC_SAND
+ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+#endif
/* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+#if RPI_HEVC_SAND
+ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
+#endif
void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+#if RPI_HEVC_SAND
+ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+#endif
void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width);
@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext {
void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
+#ifdef RPI
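+ // Sand-specific loop filters. The *2 variants take a separate pointer to
+ // the pixels left of the edge so filtering works across sand stripe
+ // boundaries. tc4 packs four 8-bit chroma tc values (Cb and Cr for each of
+ // the two 4-sample edge segments) and no_f is a 4-bit mask of sub-block
+ // edges whose filtering must be skipped (e.g. PCM) - see the callers in
+ // hevc_filter.c.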
+ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+ const uint8_t no_p[2], const uint8_t no_q[2],
+ uint8_t * _pix_l);
+ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+#endif
+
+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
} HEVCDSPContext;
void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 5bca02342d..122fbe8154 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
#include "bit_depth_template.c"
#include "hevcdsp.h"
+#include "rpi_shader_template.h"
static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
GetBitContext *gb, int pcm_bit_depth)
@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
}
}
-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride, int size)
+#if RPI_HEVC_SAND
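+// PCM for plaited (interleaved) chroma: the bitstream carries all the U
+// samples followed by all the V samples, so fill the even sample positions
+// first and then the odd ones.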
+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+ GetBitContext *gb, int pcm_bit_depth)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+
+ dst = (pixel *)_dst + 1;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+}
+#endif
+
+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride, int size)
{
int x, y;
pixel *dst = (pixel *)_dst;
@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
}
}
-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ dst[x] = av_clip_pixel(dst[x] + dc);
+ }
+ dst += stride;
+ }
+}
+
+
+#if RPI_HEVC_SAND
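+// Plaited chroma helpers: U and V samples are interleaved in a sand frame.
+// The _u/_v variants add a residual to one component while adding a constant
+// dc offset to the other; the _c variants add residuals to both components.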
+static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, const int dc_v, int size)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + *res);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+ res++;
+ }
+ dst += stride;
+ }
}
-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
+static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, const int dc_u, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + dc_u);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
+ res++;
+ }
+ dst += stride;
+ }
+}
+
+static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, unsigned int size)
+{
+ unsigned int x, y;
+ pixel *dst = (pixel *)_dst;
+ const int16_t * ru = res;
+ const int16_t * rv = res + size * size;
+
+// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
+// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
+// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
+ }
+ dst += stride;
+ }
+
+// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
+}
+
+
+static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
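+ // dc packs the U offset in the low 16 bits and the V offset in the high
+ // 16 bits, both sign-extended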
+ const int dc_v = dc >> 16;
+ const int dc_u = (dc << 16) >> 16;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + dc_u);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+ }
+ dst += stride;
+ }
+}
+
+
+#endif
+
+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual)(_dst, coeffs, stride, 4);
+}
+
+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+ FUNC(add_residual)(_dst, coeffs, stride, 8);
}
-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+ FUNC(add_residual)(_dst, coeffs, stride, 16);
}
-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+ FUNC(add_residual)(_dst, coeffs, stride, 32);
}
+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 4);
+}
+
+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 8);
+}
+
+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 16);
+}
+
+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 32);
+}
+
+#if RPI_HEVC_SAND
+// -- U -- (plaited)
+
+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
+}
+
+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
+}
+
+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
+}
+
+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- V -- (plaited)
+
+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
+}
+
+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
+}
+
+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
+}
+
+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- C -- (plaited - both U & V)
+
+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 4);
+}
+
+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 8);
+}
+
+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 16);
+}
+
+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
+}
+
+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
+}
+
+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
+}
+
+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+#endif
+
static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
{
@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
} while (0)
-static void FUNC(transform_4x4_luma)(int16_t *coeffs)
+static void FUNC(idct_4x4_luma)(int16_t *coeffs)
{
int i;
int shift = 7;
@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride
}
}
+
+#if BIT_DEPTH == 10
+#if RPI_HEVC_SAND
+// We need a 32-bit variation for the _c restores, so hijack bit depth 10
+#undef pixel
+#undef BIT_DEPTH
+#define pixel uint32_t
+#define BIT_DEPTH 32
+#endif
+// All 16 bit variations are the same
+#define sao_edge_restore_0_10 sao_edge_restore_0_9
+#define sao_edge_restore_1_10 sao_edge_restore_1_9
+#define sao_edge_restore_0_11 sao_edge_restore_0_9
+#define sao_edge_restore_1_11 sao_edge_restore_1_9
+#define sao_edge_restore_0_12 sao_edge_restore_0_9
+#define sao_edge_restore_1_12 sao_edge_restore_1_9
+#define sao_edge_restore_0_13 sao_edge_restore_0_9
+#define sao_edge_restore_1_13 sao_edge_restore_1_9
+#define sao_edge_restore_0_14 sao_edge_restore_0_9
+#define sao_edge_restore_1_14 sao_edge_restore_1_9
+#define sao_edge_restore_0_15 sao_edge_restore_0_9
+#define sao_edge_restore_1_15 sao_edge_restore_1_9
+#define sao_edge_restore_0_16 sao_edge_restore_0_9
+#define sao_edge_restore_1_16 sao_edge_restore_1_9
+#endif
+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
int *borders, int _width, int _height,
@@ -367,7 +643,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
int x, y;
pixel *dst = (pixel *)_dst;
pixel *src = (pixel *)_src;
- int16_t *sao_offset_val = sao->offset_val[c_idx];
int sao_eo_class = sao->eo_class[c_idx];
int init_x = 0, width = _width, height = _height;
@@ -376,33 +651,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
if (sao_eo_class != SAO_EO_VERT) {
if (borders[0]) {
- int offset_val = sao_offset_val[0];
for (y = 0; y < height; y++) {
- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+ dst[y * stride_dst] = src[y * stride_src];
}
init_x = 1;
}
if (borders[2]) {
- int offset_val = sao_offset_val[0];
int offset = width - 1;
for (x = 0; x < height; x++) {
- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
}
width--;
}
}
if (sao_eo_class != SAO_EO_HORIZ) {
if (borders[1]) {
- int offset_val = sao_offset_val[0];
for (x = init_x; x < width; x++)
- dst[x] = av_clip_pixel(src[x] + offset_val);
+ dst[x] = src[x];
}
if (borders[3]) {
- int offset_val = sao_offset_val[0];
- int y_stride_dst = stride_dst * (height - 1);
- int y_stride_src = stride_src * (height - 1);
+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+ ptrdiff_t y_stride_src = stride_src * (height - 1);
for (x = init_x; x < width; x++)
- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+ dst[x + y_stride_dst] = src[x + y_stride_src];
height--;
}
}
@@ -417,7 +688,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
int x, y;
pixel *dst = (pixel *)_dst;
pixel *src = (pixel *)_src;
- int16_t *sao_offset_val = sao->offset_val[c_idx];
int sao_eo_class = sao->eo_class[c_idx];
int init_x = 0, init_y = 0, width = _width, height = _height;
@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
if (sao_eo_class != SAO_EO_VERT) {
if (borders[0]) {
- int offset_val = sao_offset_val[0];
for (y = 0; y < height; y++) {
- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+ dst[y * stride_dst] = src[y * stride_src];
}
init_x = 1;
}
if (borders[2]) {
- int offset_val = sao_offset_val[0];
int offset = width - 1;
for (x = 0; x < height; x++) {
- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
}
width--;
}
}
if (sao_eo_class != SAO_EO_HORIZ) {
if (borders[1]) {
- int offset_val = sao_offset_val[0];
for (x = init_x; x < width; x++)
- dst[x] = av_clip_pixel(src[x] + offset_val);
+ dst[x] = src[x];
init_y = 1;
}
if (borders[3]) {
- int offset_val = sao_offset_val[0];
- int y_stride_dst = stride_dst * (height - 1);
- int y_stride_src = stride_src * (height - 1);
+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+ ptrdiff_t y_stride_src = stride_src * (height - 1);
for (x = init_x; x < width; x++)
- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+ dst[x + y_stride_dst] = src[x + y_stride_src];
height--;
}
}
@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
}
}
+#endif
+#if BIT_DEPTH == 32
+#undef BIT_DEPTH
+#undef pixel
+#define BIT_DEPTH 10
+#define pixel uint16_t
+#endif
+
+// --- Plaited chroma versions
+
+#if RPI_HEVC_SAND
+
+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int offset_table_u[32] = { 0 };
+ int offset_table_v[32] = { 0 };
+ int k, y, x;
+ int shift = BIT_DEPTH - 5;
+
+ stride_dst /= sizeof(pixel);
+ stride_src /= sizeof(pixel);
+ width *= 2;
+
+ for (k = 0; k < 4; k++)
+ {
+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+ }
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 2)
+ {
+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
+ // *** The & 31 masking shouldn't be needed, but at present we generate broken
+ // input that would crash us in the 10-bit case without it
+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
+ }
+ dst += stride_dst;
+ src += stride_src;
+ }
+}
+
+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+ int eo, int width, int height) {
+
+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+ static const int8_t pos[4][2][2] = {
+ { { -1, 0 }, { 1, 0 } }, // horizontal
+ { { 0, -1 }, { 0, 1 } }, // vertical
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+ };
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+
+ stride_dst /= sizeof(pixel);
+ width *= 2;
+
+ av_assert0(width <= 64);
+
+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 2) {
+ int diff0u = CMP(src[x], src[x + a_stride]);
+ int diff1u = CMP(src[x], src[x + b_stride]);
+ int offset_valu = edge_idx[2 + diff0u + diff1u];
+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+ int offset_valv = edge_idx[2 + diff0v + diff1v];
+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
+ }
+ src += stride_src;
+ dst += stride_dst;
+ }
+}
+
+// Do once
+#if BIT_DEPTH == 8
+// Any old 2 byte 'normal' restore will work for these
+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
+// We need 32 bit for 9 bit+
+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
+#endif
+
+#endif // RPI_HEVC_SAND
+
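A brief editorial note on the layout the *_c helpers above assume: chroma is stored interleaved ("plaited"), with the U and V samples of each pixel adjacent, which is why the loops double width, step x by 2 and keep separate U and V offset tables. The sketch below (hypothetical helper, not part of the patch) shows the addressing convention.

    #include <stdint.h>

    /* Pixel x of an interleaved chroma row has its U sample at 2*x and its V
     * sample at 2*x + 1, so independent U/V adjustments can be applied in a
     * single pass over the plaited data. */
    static void plaited_row_add(uint8_t *row, int chroma_width, int u_off, int v_off)
    {
        for (int x = 0; x < chroma_width; x++) {
            int u = row[2 * x + 0] + u_off;   /* U sample of pixel x */
            int v = row[2 * x + 1] + v_off;   /* V sample of pixel x */
            row[2 * x + 0] = u < 0 ? 0 : u > 255 ? 255 : u;
            row[2 * x + 1] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }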
#undef CMP
@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
#undef TQ1
#undef TQ2
#undef TQ3
+
+#if RPI_HEVC_SAND
+
+// line zero
+#define P3 pix_l[0 * xstride]
+#define P2 pix_l[1 * xstride]
+#define P1 pix_l[2 * xstride]
+#define P0 pix_l[3 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+#define Q2 pix_r[2 * xstride]
+#define Q3 pix_r[3 * xstride]
+
+// line three. used only for deblocking decision
+#define TP3 pix_l[0 * xstride + 3 * ystride]
+#define TP2 pix_l[1 * xstride + 3 * ystride]
+#define TP1 pix_l[2 * xstride + 3 * ystride]
+#define TP0 pix_l[3 * xstride + 3 * ystride]
+#define TQ0 pix_r[0 * xstride + 3 * ystride]
+#define TQ1 pix_r[1 * xstride + 3 * ystride]
+#define TQ2 pix_r[2 * xstride + 3 * ystride]
+#define TQ3 pix_r[3 * xstride + 3 * ystride]
+
+// This is identical to hevc_loop_filter_luma except that the P/Q
+// components are on separate pointers
+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t _tc[2],
+ const uint8_t _no_p[2], const uint8_t _no_q[2],
+ uint8_t * _pix_l)
+{
+ int d, j;
+ pixel *pix_l = (pixel *)_pix_l;
+ pixel *pix_r = (pixel *)_pix_r;
+ const ptrdiff_t xstride = 1;
+ const ptrdiff_t ystride = _stride / sizeof(pixel);
+
+ beta <<= BIT_DEPTH - 8;
+
+ for (j = 0; j < 2; j++) {
+ const int dp0 = abs(P2 - 2 * P1 + P0);
+ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
+ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
+ const int d0 = dp0 + dq0;
+ const int d3 = dp3 + dq3;
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ const int no_p = _no_p[j];
+ const int no_q = _no_q[j];
+
+ if (d0 + d3 >= beta) {
+ pix_l += 4 * ystride;
+ pix_r += 4 * ystride;
+ continue;
+ } else {
+ const int beta_3 = beta >> 3;
+ const int beta_2 = beta >> 2;
+ const int tc25 = ((tc * 5 + 1) >> 1);
+
+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
+ // strong filtering
+ const int tc2 = tc << 1;
+ for (d = 0; d < 4; d++) {
+ const int p3 = P3;
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ const int q3 = Q3;
+ if (!no_p) {
+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+ }
+ if (!no_q) {
+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+ }
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ } else { // normal filtering
+ int nd_p = 1;
+ int nd_q = 1;
+ const int tc_2 = tc >> 1;
+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+ nd_p = 2;
+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+ nd_q = 2;
+
+ for (d = 0; d < 4; d++) {
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+ if (abs(delta0) < 10 * tc) {
+ delta0 = av_clip(delta0, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ if (!no_p && nd_p > 1) {
+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+ P1 = av_clip_pixel(p1 + deltap1);
+ }
+ if (!no_q && nd_q > 1) {
+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+ Q1 = av_clip_pixel(q1 + deltaq1);
+ }
+ }
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ }
+ }
+ }
+}
+
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2
+#undef TQ3
+
+#undef P3
+#undef P2
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+
+#define P1 pix_l[0 * xstride]
+#define P0 pix_l[1 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+
+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
+ ptrdiff_t _ystride, const int32_t *_tc,
+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
+{
+ int d, j, no_p, no_q;
+ pixel *pix_l = (pixel *)_pix_l;
+ pixel *pix_r = (pixel *)_pix_r;
+ ptrdiff_t xstride = _xstride / sizeof(pixel);
+ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+ for (j = 0; j < 2; j++) {
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ if (tc <= 0) {
+ pix_l += 4 * ystride;
+ pix_r += 4 * ystride;
+ continue;
+ }
+ no_p = _no_p[j];
+ no_q = _no_q[j];
+
+ for (d = 0; d < 4; d++) {
+ int delta0;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ }
+}
+
+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+ unsigned int no_f)
+{
+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
+}
+
+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f)
+{
+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
+}
+
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+
+
+#endif
+
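An editorial aside on the argument packing used by hevc_h_loop_filter_uv and hevc_v_loop_filter_uv2 above: the four per-sub-edge tc values arrive packed one per byte of tc4, and the p/q "skip" flags arrive in the low four bits of no_f, exactly as the unpacking at the top of those functions shows. The helpers below are a hedged sketch of how a caller might build those words; they are illustrative only and are not part of the patch.

    #include <stdint.h>

    /* Pack four per-sub-edge tc values, one per byte, lowest sub-edge first. */
    static inline uint32_t pack_tc4(uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3)
    {
        return (uint32_t)tc0 | ((uint32_t)tc1 << 8) |
               ((uint32_t)tc2 << 16) | ((uint32_t)tc3 << 24);
    }

    /* Pack the no-filter flags: bits 0-1 select the P side, bits 2-3 the Q side. */
    static inline uint32_t pack_no_f(int no_p0, int no_p1, int no_q0, int no_q1)
    {
        return (no_p0 ? 1u : 0u) | (no_p1 ? 2u : 0u) |
               (no_q0 ? 4u : 0u) | (no_q1 ? 8u : 0u);
    }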
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
index 02c1766059..cea16eade4 100644
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@@ -24,6 +24,7 @@
#include "hevcpred.h"
+#define PRED_C 0
#define BIT_DEPTH 8
#include "hevcpred_template.c"
#undef BIT_DEPTH
@@ -39,13 +40,37 @@
#define BIT_DEPTH 12
#include "hevcpred_template.c"
#undef BIT_DEPTH
+#undef PRED_C
+
+#ifdef RPI
+#define PRED_C 1
+#define BIT_DEPTH 8
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+#undef PRED_C
+#endif
void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
{
#undef FUNC
#define FUNC(a, depth) a ## _ ## depth
-#define HEVC_PRED(depth) \
+#undef FUNCC
+#define FUNCC(a, depth) a ## _ ## depth ## _c
+
+#define HEVC_PRED_Y(depth) \
hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \
hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \
hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \
@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
+#define HEVC_PRED_C(depth) \
+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
+
+#ifdef RPI
+#define HEVC_PRED(depth) \
+ HEVC_PRED_Y(depth); \
+ HEVC_PRED_C(depth);
+#else
+#define HEVC_PRED(depth) \
+ HEVC_PRED_Y(depth);
+#endif
+
switch (bit_depth) {
case 9:
HEVC_PRED(9);
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
index eb17663683..00ba3f94c0 100644
--- a/libavcodec/hevcpred.h
+++ b/libavcodec/hevcpred.h
@@ -38,6 +38,17 @@ typedef struct HEVCPredContext {
void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
const uint8_t *left, ptrdiff_t stride,
int c_idx, int mode);
+#ifdef RPI
+ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+
+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+ ptrdiff_t stride, int log2_size, int c_idx);
+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int c_idx, int mode);
+#endif
} HEVCPredContext;
void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 6fe33546b1..2f9f5f2798 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -20,13 +20,110 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_INTRA
+
#include "libavutil/pixdesc.h"
#include "bit_depth_template.c"
#include "hevcpred.h"
+#ifdef RPI
+#include "libavutil/rpi_sand_fns.h"
+#endif
+
+#define DUMP_PRED 0
+
#define POS(x, y) src[(x) + stride * (y)]
+// INCLUDED_ONCE defined at EOF
+#if defined(RPI) && !defined(INCLUDED_ONCE)
+typedef uint8_t (* c8_dst_ptr_t)[2];
+typedef const uint8_t (* c8_src_ptr_t)[2];
+typedef uint16_t (* c16_dst_ptr_t)[2];
+typedef const uint16_t (* c16_src_ptr_t)[2];
+
+// *** On ARM make these NEON registers
+typedef struct pixel4_16 {
+ uint16_t x[4];
+} pixel4_16;
+typedef struct pixel4_32 {
+ uint32_t x[4];
+} pixel4_32;
+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
+{
+ pixel4_16 t = {{x, x, x, x}};
+ return t;
+}
+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
+{
+ pixel4_32 t = {{x, x, x, x}};
+ return t;
+}
+#endif
+
+#if PRED_C
+// For chroma we double pixel size so we copy pairs
+#undef pixel
+#undef pixel2
+#undef pixel4
+#undef dctcoef
+#undef INIT_CLIP
+#undef no_rnd_avg_pixel4
+#undef rnd_avg_pixel4
+#undef AV_RN2P
+#undef AV_RN4P
+#undef AV_RN4PA
+#undef AV_WN2P
+#undef AV_WN4P
+#undef AV_WN4PA
+#undef CLIP
+#undef FUNC
+#undef FUNCC
+#undef av_clip_pixel
+#undef PIXEL_SPLAT_X4
+
+#if BIT_DEPTH == 8
+#define pixel uint16_t
+#define pixel4 pixel4_16
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
+#define cpel uint8_t
+#define c_src_ptr_t c8_src_ptr_t
+#define c_dst_ptr_t c8_dst_ptr_t
+#else
+#define pixel uint32_t
+#define pixel4 pixel4_32
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
+#define cpel uint16_t
+#define c_src_ptr_t c16_src_ptr_t
+#define c_dst_ptr_t c16_dst_ptr_t
+#endif
+#define AV_RN4P(p) (*(pixel4*)(p))
+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
+#endif
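An editorial note on the type swap above: in the PRED_C build a "pixel" is widened to hold a U/V pair, so every generic load, store or splat in the shared template moves both chroma components at once; the "no samples available" fill further down, (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)), writes the mid-grey value into both halves of the pair in one go. A minimal sketch for the 8-bit case, with a hypothetical helper name and assuming the little-endian byte order of the target:

    #include <stdint.h>

    /* One PRED_C "pixel" at 8 bits is a 16-bit word holding a U/V byte pair;
     * a single store therefore updates both chroma components. */
    static inline uint16_t uv_pair_splat8(uint8_t u, uint8_t v)
    {
        return (uint16_t)u | ((uint16_t)v << 8);  /* U in the low byte on little-endian */
    }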
+
+
+// Get PW prior to horrid PRED_C trickery
+#if BIT_DEPTH == 8
+#define PW 1
+#else
+#define PW 2
+#endif
+
+
+#if DUMP_PRED && !defined(INCLUDED_ONCE)
+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+{
+ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
+ for (unsigned int x = 0; x != size; x++) {
+ printf("%4d", data[x * 2]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+}
+#endif
+
static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
int log2_size, int c_idx)
{
@@ -69,8 +166,11 @@ do { \
AV_WN4P(&ptr[i], a); \
else \
a = PIXEL_SPLAT_X4(ptr[i + 3])
-
+#ifdef RPI
+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+#else
HEVCLocalContext *lc = s->HEVClc;
+#endif
int i;
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
@@ -79,15 +179,23 @@ do { \
int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
int size_in_luma_v = size << vshift;
int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
- int x = x0 >> hshift;
- int y = y0 >> vshift;
+ const int x = x0 >> hshift;
+ const int y = y0 >> vshift;
int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+#if defined(RPI)
+ pixel *const src = !av_rpi_is_sand_frame(s->frame) ?
+ (pixel*)s->frame->data[c_idx] + x + y * stride :
+ c_idx == 0 ?
+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
+#else
pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride;
+#endif
int min_pu_width = s->ps.sps->min_pu_width;
@@ -95,14 +203,20 @@ do { \
lc->tu.intra_pred_mode;
pixel4 a;
pixel left_array[2 * MAX_TB_SIZE + 1];
+#if !PRED_C
pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
+#endif
pixel top_array[2 * MAX_TB_SIZE + 1];
+#if !PRED_C
pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
+#endif
pixel *left = left_array + 1;
pixel *top = top_array + 1;
+#if !PRED_C
pixel *filtered_left = filtered_left_array + 1;
pixel *filtered_top = filtered_top_array + 1;
+#endif
int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
int cand_left = lc->na.cand_left;
int cand_up_left = lc->na.cand_up_left;
@@ -114,6 +228,27 @@ do { \
int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
(x0 + size_in_luma_h)) >> hshift;
+ pixel * src_l = src - 1;
+ pixel * src_u = src - stride;
+ pixel * src_ur = src_u + size;
+
+#ifdef DISABLE_INTRA
+ return;
+#endif
+
+#if defined(RPI)
+ if (av_rpi_is_sand_frame(s->frame)) {
+ // N.B. stride is in pixels (not bytes), or in pixel-pairs in the chroma case
+ const AVFrame * const frame = s->frame;
+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
+ if ((x & mask) == 0)
+ src_l -= stripe_adj;
+ if (((x + size) & mask) == 0)
+ src_ur += stripe_adj;
+ }
+#endif
+
if (s->ps.pps->constrained_intra_pred_flag == 1) {
int size_in_luma_pu_v = PU(size_in_luma_v);
int size_in_luma_pu_h = PU(size_in_luma_h);
@@ -163,23 +298,24 @@ do { \
top[-1] = 128;
}
if (cand_up_left) {
- left[-1] = POS(-1, -1);
+ left[-1] = src_l[-stride];
top[-1] = left[-1];
}
if (cand_up)
- memcpy(top, src - stride, size * sizeof(pixel));
+ // Always good - even with sand
+ memcpy(top, src_u, size * sizeof(pixel));
if (cand_up_right) {
- memcpy(top + size, src - stride + size, size * sizeof(pixel));
- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
size - top_right_size);
}
if (cand_left)
for (i = 0; i < size; i++)
- left[i] = POS(-1, i);
+ left[i] = src_l[stride * i];
if (cand_bottom_left) {
for (i = size; i < size + bottom_left_size; i++)
- left[i] = POS(-1, i);
- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+ left[i] = src_l[stride * i];
+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
size - bottom_left_size);
}
@@ -268,7 +404,11 @@ do { \
cand_up_left = 1;
cand_left = 1;
} else { // No samples available
+#if PRED_C
+ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8));
+#else
left[-1] = (1 << (BIT_DEPTH - 1));
+#endif
EXTEND(top, left[-1], 2 * size);
EXTEND(left, left[-1], 2 * size);
}
@@ -287,6 +427,9 @@ do { \
top[-1] = left[-1];
// Filtering process
+ // Sand can only apply to chroma_format_idc == 1 so we don't need to
+ // worry about chroma smoothing for that case
+#if !PRED_C
if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
if (mode != INTRA_DC && size != 4){
int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
@@ -342,6 +485,30 @@ do { \
mode);
break;
}
+#else
+ switch (mode) {
+ case INTRA_PLANAR:
+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
+ (uint8_t *)left, stride, log2_size, c_idx);
+ break;
+ default:
+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+ (uint8_t *)left, stride, c_idx,
+ mode);
+ break;
+ }
+
+#if DUMP_PRED
+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
+#endif
+#endif
}
#define INTRA_PRED(size) \
@@ -357,6 +524,7 @@ INTRA_PRED(5)
#undef INTRA_PRED
+#if !PRED_C
static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
const uint8_t *_left, ptrdiff_t stride,
int trafo_size)
@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to
POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
(size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1);
}
+#else
+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
+ const uint8_t * _left, ptrdiff_t stride,
+ int trafo_size)
+{
+ int x, y;
+ int size = 1 << trafo_size;
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const c_src_ptr_t top = (c_src_ptr_t)_top;
+ const c_src_ptr_t left = (c_src_ptr_t)_left;
+
+ for (y = 0; y < size; y++, src += stride)
+ {
+ for (x = 0; x < size; x++)
+ {
+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] +
+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] +
+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
+ }
+ }
+}
+#endif
#define PRED_PLANAR(size)\
static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
@@ -386,6 +577,7 @@ PRED_PLANAR(3)
#undef PRED_PLANAR
+#if !PRED_C
static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
const uint8_t *_left,
ptrdiff_t stride, int log2_size, int c_idx)
@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
}
}
+#else
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride, int log2_size, int c_idx)
+{
+ unsigned int i, j;
+ const unsigned int size = (1 << log2_size);
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const c_src_ptr_t top = (c_src_ptr_t)_top;
+ const c_src_ptr_t left = (c_src_ptr_t)_left;
+ unsigned int dc0 = size;
+ unsigned int dc1 = size;
+
+ for (i = 0; i < size; i++)
+ {
+ dc0 += left[i][0] + top[i][0];
+ dc1 += left[i][1] + top[i][1];
+ }
+
+ dc0 >>= log2_size + 1;
+ dc1 >>= log2_size + 1;
+
+ for (i = 0; i < size; i++, src += stride)
+ {
+ for (j = 0; j < size; ++j)
+ {
+ src[j][0] = dc0;
+ src[j][1] = dc1;
+ }
+ }
+}
+#endif
+
+#ifndef ANGLE_CONSTS
+#define ANGLE_CONSTS
+static const int intra_pred_angle[] = {
+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+};
+static const int inv_angle[] = {
+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+ -630, -910, -1638, -4096
+};
+#endif
+
+#if !PRED_C
static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
const uint8_t *_top,
const uint8_t *_left,
@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
const pixel *top = (const pixel *)_top;
const pixel *left = (const pixel *)_left;
- static const int intra_pred_angle[] = {
- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
- };
- static const int inv_angle[] = {
- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
- -630, -910, -1638, -4096
- };
-
int angle = intra_pred_angle[mode - 2];
pixel ref_array[3 * MAX_TB_SIZE + 4];
pixel *ref_tmp = ref_array + size;
@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
}
}
}
+#else
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride, int c_idx,
+ int mode, int size)
+{
+ int x, y;
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ c_src_ptr_t top = (c_src_ptr_t)_top;
+ c_src_ptr_t left = (c_src_ptr_t)_left;
+
+ const int angle = intra_pred_angle[mode - 2];
+ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
+ c_dst_ptr_t ref_tmp = ref_array + size;
+ c_src_ptr_t ref;
+ const int last = (size * angle) >> 5;
+
+ if (mode >= 18) {
+ ref = top - 1;
+ if (angle < 0 && last < -1) {
+ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW);
+ for (x = last; x <= -1; x++)
+ {
+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+ }
+ ref = (c_src_ptr_t)ref_tmp;
+ }
+
+ for (y = 0; y < size; y++, src += stride) {
+ const int idx = ((y + 1) * angle) >> 5;
+ const int fact = ((y + 1) * angle) & 31;
+ if (fact) {
+ for (x = 0; x < size; ++x) {
+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
+ fact * ref[x + idx + 2][0] + 16) >> 5;
+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
+ fact * ref[x + idx + 2][1] + 16) >> 5;
+ }
+ } else {
+ memcpy(src, ref + idx + 1, size * 2 * PW);
+ }
+ }
+ } else {
+ ref = left - 1;
+ if (angle < 0 && last < -1) {
+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
+ for (x = last; x <= -1; x++)
+ {
+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+ }
+ ref = (c_src_ptr_t)ref_tmp;
+ }
+
+ for (x = 0; x < size; x++, src++) {
+ const int idx = ((x + 1) * angle) >> 5;
+ const int fact = ((x + 1) * angle) & 31;
+ if (fact) {
+ for (y = 0; y < size; y++) {
+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
+ fact * ref[y + idx + 2][0] + 16) >> 5;
+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
+ fact * ref[y + idx + 2][1] + 16) >> 5;
+ }
+ } else {
+ for (y = 0; y < size; y++)
+ {
+ src[y * stride][0] = ref[y + idx + 1][0];
+ src[y * stride][1] = ref[y + idx + 1][1];
+ }
+ }
+ }
+ }
+}
+#endif
static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
const uint8_t *left,
@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
}
+#undef cpel
+#undef c_src_ptr_t
+#undef c_dst_ptr_t
+
#undef EXTEND_LEFT_CIP
#undef EXTEND_RIGHT_CIP
#undef EXTEND_UP_CIP
@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
#undef EXTEND
#undef MIN_TB_ADDR_ZS
#undef POS
+#undef PW
+
+#ifndef INCLUDED_ONCE
+#define INCLUDED_ONCE
+#endif
+
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index d36b68bfae..b526dc393d 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
{ AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
{ AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
+ /* RPI */
+#ifdef RPI
+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
+#endif
+
/* special */
{ AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
{ AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index d83705645c..4c746786ff 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -31,6 +31,8 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
+#include "libavutil/avassert.h"
+#include "libavutil/rpi_sand_fns.h"
static av_cold int raw_encode_init(AVCodecContext *avctx)
{
@@ -47,6 +49,73 @@ FF_ENABLE_DEPRECATION_WARNINGS
return 0;
}
+#ifdef RPI
+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
+ int size;
+ int width = frame->width;
+ int height = frame->height;
+ int x0 = 0;
+ int y0 = 0;
+ uint8_t * dst;
+ int ret;
+
+ if (sd != NULL) {
+ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+
+ x0 = si->left_offset;
+ y0 = si->top_offset;
+ }
+
+ size = width * height * 3 / 2;
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
+ dst += width * height;
+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
+ return 0;
+}
+
+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
+ int size;
+ int width = frame->width;
+ int height = frame->height;
+ int x0 = 0;
+ int y0 = 0;
+ uint8_t * dst;
+ int ret;
+
+ if (sd != NULL) {
+ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+
+ x0 = si->left_offset;
+ y0 = si->top_offset;
+ }
+
+ size = width * height * 3;
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
+ dst += width * height * 2;
+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
+ return 0;
+}
+#endif
+
+
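As a quick editorial sanity check on the packet sizes above: for an 8-bit 1920x1080 SAND frame the planar YUV 4:2:0 output is 1920 * 1080 * 3 / 2 = 3,110,400 bytes, while the 16-bit-per-sample variant doubles every sample and so needs 1920 * 1080 * 3 = 6,220,800 bytes.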
static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
const AVFrame *frame, int *got_packet)
{
@@ -56,6 +125,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
if (ret < 0)
return ret;
+#ifdef RPI
+ if (av_rpi_is_sand_frame(frame)) {
+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame);
+ *got_packet = (ret == 0);
+ return ret;
+ }
+#endif
+
if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
return ret;
if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
new file mode 100644
index 0000000000..391f761df9
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.s
@@ -0,0 +1,923 @@
+# ******************************************************************************
+# Argon Design Ltd.
+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+#
+# Module : HEVC
+# Author : Peter de Rivaz
+# ******************************************************************************
+
+# HEVC VPU Transform
+# fe
+# Transform matrix can be thought of as
+# output row vector = input row vector * transMatrix2
+#
+# The even rows of the matrix are symmetric
+# The odd rows of the matrix are antisymmetric
+#
+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+#
+# EXAMPLE
+# (a b c d) ( 1  2  2  1)
+#           ( 3  4 -4 -3)
+#           ( 5  6  6  5)
+#           ( 7  8 -8 -7)
+#
+# x=(a c)(1 2) = 1a+5c 2a+6c
+#        (5 6)
+#
+# y=(b d)(3 4) = 3b+7d 4b+8d
+#        (7 8)
+#
+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+#
+# Final results are (u , v[::-1])
+#
+#
+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+# Apply the even matrix first and stop before rounding
+# Then apply the odd matrix in a full manner:
+#
+# First step is to compute partial products with the first input (16 cycles)
+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
+# 2a 4b 6c 8d
+# 2a -4b 6c -8d
+# 1a -3b 5c -7d
+#
+# Second step is to sum partial products into final position (8 cycles)
+# 1a+3b+5c+7d
+# 2a+4b+6c+8d
+# 2a-4b+6c-8d
+# 1a-3b+5c-7d
+#
+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+#
+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+#
+# For 8x8 we could compute two in parallel.
+#
+#
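The worked example in the comment above maps directly onto a few lines of C. The sketch below is editorial (it is not part of the VPU code) and just shows the 4-point case with the matrix rows 1..8: only the two half-size partial products are computed, and the butterfly then yields all four outputs.

    /* out = in * M for the example matrix, using the even/odd split: the even
     * rows (1 2 2 1) and (5 6 6 5) are symmetric, the odd rows antisymmetric,
     * so the full result is (x + y, reversed(x - y)). */
    static void butterfly4_example(const int in[4], int out[4])
    {
        const int a = in[0], b = in[1], c = in[2], d = in[3];

        int x0 = 1 * a + 5 * c, x1 = 2 * a + 6 * c;   /* even inputs (a, c) */
        int y0 = 3 * b + 7 * d, y1 = 4 * b + 8 * d;   /* odd inputs  (b, d) */

        out[0] = x0 + y0;
        out[1] = x1 + y1;
        out[2] = x1 - y1;   /* differences come out in reverse order */
        out[3] = x0 - y0;
    }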
+
+# Columns are transformed first
+#
+# Store top left half of transMatrix2 in
+# Store bottom left half of transMatrix2 in HX(32,32)
+#
+# For 16x16
+# HX(0:15,0) contains input data before transform
+# HY(0:15,0) contains 32bit output data after transform
+# HX(32,0) contains even rows of left half of transMatrix2
+# HX(32,32) contains odd rows of left half of transMatrix2
+# HY(48,0) contains partial products ready for summing
+#
+
+
+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+# coeffs32
+# num32: number of 32x32 transforms
+# command 0 for transform, 1 for memclear16(int16_t *dst, num16), 2 for deblock, 3 for uv deblock, 4 for uv deblock with clear, 5 for run_command_list
+#
+
+.equ TRANS_SHIFT, 20 - BIT_DEPTH
+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
+.equ TRANS_ASL2, 16 - TRANS_SHIFT
+
+
+hevc_trans_16x16:
+ cmp r5,1
+ beq memclear16
+ cmp r5,2
+ beq hevc_deblock_16x16
+ cmp r5,3
+ beq hevc_uv_deblock_16x16
+ cmp r5,4
+ beq hevc_uv_deblock_16x16_with_clear
+ cmp r5,5
+ beq hevc_run_command_list
+
+ push r6-r15, lr # TODO cut down number of used registers
+ mov r14,r3 # coeffs32
+ mov r15,r4 # num32
+ mov r3, 16*2 # Stride of transMatrix2 in bytes
+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+
+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ # Now use r0 to describe which matrix we are working on.
+ # Allows us to prefetch the next block of coefficients for efficiency.
+ mov r0,0 # This describes the location where we read our coefficients from
+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+ mov r7,16*16*2 # Total block size
+ mov r8,64*16 # Value used to swap from current to next VRF location
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ mov r4,64 # Constant used for rounding first pass
+ mov r5,TRANS_RND2 # Constant used for rounding second pass
+
+ # At start of block r0,r1 point to the current block (that has already been loaded)
+block_loop:
+ eor r0,r8
+ add r1,r7
+ # Prefetch the next block
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ eor r0,r8
+ sub r1,r7
+
+ # Transform the current block
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
+
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
+
+ # Save results - note there has been a transposition during the processing so we save columns
+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
+
+ # Move onto next block
+ eor r0,r8
+ add r1,r7
+
+ addcmpbgt r2,-1,0,block_loop
+
+ # Now go and do any 32x32 transforms
+ b hevc_trans_32x32
+
+ pop r6-r15, pc
+
+# r1,r2,r3 r7,r8 should be preserved
+# HX(0++,0)+r0 is the block to be transformed
+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+# Use HY(48,0) for intermediate results
+# r0 can be used, but should be returned to its original value at the end
+col_trans_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+col_trans_odd_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_odd_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_odd_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+#
+hevc_trans_32x32:
+ mov r1,r14 # coeffs
+ mov r2,r15 # num
+
+ # Fetch odd transform matrix
+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+ #add r0, 16*16*2
+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+ mov r7, 16*16*2 # Total block size
+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+ # set r8 to 32byte aligned stack pointer
+ add r8,sp,31
+ lsr r8,5
+ lsl r8,5
+ mov r9,r8 # Backup of the temporary storage
+ mov r10,r1 # Backup of the coefficient buffer
+block_loop32:
+
+ # COLUMN TRANSFORM
+ mov r4, 64 # Constant used for rounding first pass
+ mov r5, 9 # left shift used for rounding first pass
+
+ # Transform the first 16 columns
+ mov r1,r10 # Input Coefficient buffer
+ mov r8,r9 # Output temporary storage
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ # ROW TRANSFORM
+ mov r4, TRANS_RND2 # Constant used for rounding second pass
+ mov r5, TRANS_ASL2 # left shift used for rounding second pass
+
+ mov r1,r9 # Input temporary storage
+ mov r8,r10 # Output Coefficient buffer
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ add r10, 32*32*2 # move onto next block of coefficients
+ addcmpbgt r2,-1,0,block_loop32
+
+ add sp,sp,32*32*2+32 # Restore stack
+
+ pop r6-r15, pc
+
+trans32:
+ push lr
+ # We can no longer afford the VRF space to do prefetching when doing 32x32
+ # Fetch the even rows
+ vldh HX(0++,0),(r1 += r3) REP 16
+ # Fetch the odd rows
+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+
+ # Transform the even rows using even matrix
+ mov r0, 0 # Even rows
+ bl col_trans_16
+
+ # Now transform the odd rows using odd matrix
+ mov r0, 64*16 # Odd rows
+ bl col_trans_odd_16
+
+ # Now apply butterfly to compute the first 16 results
+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ # 16bit results now in HX(48,32)
+ mov r0,r8
+ mov r6,32*2
+ vsth VX(48,32++),(r0+=r6) REP 16
+
+ # Now apply butterfly to compute the second 16 results (in reverse order)
+ vsub HY(63,0),HY(0 ,0),HY(16,0)
+ vsub HY(62,0),HY(1 ,0),HY(17,0)
+ vsub HY(61,0),HY(2 ,0),HY(18,0)
+ vsub HY(60,0),HY(3 ,0),HY(19,0)
+ vsub HY(59,0),HY(4 ,0),HY(20,0)
+ vsub HY(58,0),HY(5 ,0),HY(21,0)
+ vsub HY(57,0),HY(6 ,0),HY(22,0)
+ vsub HY(56,0),HY(7 ,0),HY(23,0)
+ vsub HY(55,0),HY(8 ,0),HY(24,0)
+ vsub HY(54,0),HY(9 ,0),HY(25,0)
+ vsub HY(53,0),HY(10,0),HY(26,0)
+ vsub HY(52,0),HY(11,0),HY(27,0)
+ vsub HY(51,0),HY(12,0),HY(28,0)
+ vsub HY(50,0),HY(13,0),HY(29,0)
+ vsub HY(49,0),HY(14,0),HY(30,0)
+ vsub HY(48,0),HY(15,0),HY(31,0)
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ add r0,r8,32
+ vsth VX(48,32++),(r0+=r6) REP 16
+ pop pc
+
+memclear16:
+ # r0 is address
+ # r1 is the number of 16-bit values to set to 0 (may overrun past the end and clear more than specified)
+ vmov HX(0++,0),0 REP 16
+ mov r2,32
+loop:
+ vsth HX(0++,0),(r0+=r2) REP 16
+ add r0,16*16*2
+ sub r1,16*16
+ cmp r1,0
+ bgt loop
+ b lr
+
+
+################################################################################
+# HEVC VPU Deblock
+#
+# Vertical edges before horizontal
+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+#
+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+# The VPU code works in units of 16x16 blocks.
+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+# One final horizontal filter is required at the end.
+# PCM is not allowed in this code.
+#
+#
+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering)
+
+.set P0,63
+.set P1,62
+.set P2,61
+.set P3,60
+.set Q0,59
+.set Q1,58
+.set Q2,57
+.set Q3,56
+
+.set dp,32
+.set dq,33
+.set d,34
+.set decision,35
+.set beta,36
+.set beta2,37
+.set beta3,38
+.set ptest,39
+.set qtest,40
+.set pqtest,41
+.set thresh,42
+.set deltatest, 44
+.set deltap1, 45
+.set tc25, 46
+.set setup,47
+.set tc,48
+.set tc25,49
+.set tc2, 50
+.set do_filter, 51
+.set delta, 52
+.set tc10, 53
+.set delta0, 54
+.set delta1, 55
+.set zeros, 0
+.set setup_input, 1
+.set deltaq1, 2
+
+
+
+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+# Row has num16 16x16 blocks across
+# Beta goes from 0 to 64
+# tc goes from 0 to 24
+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+# has 8 bytes per edge
+# has 16 bytes per direction
+# has 32 bytes per 16x16 block
+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
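As a reading aid, the setup[] layout spelt out above can be written as a C declaration. This is an editorial sketch with a hypothetical type name, not a structure defined by the patch; the sizes follow the comment: 8 bytes per edge, 16 per direction, 32 per 16x16 block.

    #include <stdint.h>

    typedef struct {
        struct {
            struct {
                uint8_t beta[4];   /* one beta per 4-pixel sub-edge */
                uint8_t tc[4];     /* one tc per 4-pixel sub-edge   */
            } edge[2];             /* first and second 8-pixel edge */
        } dir[2];                  /* 0 = vertical, 1 = horizontal  */
    } deblock_setup16;             /* 32 bytes per 16x16 block      */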
+hevc_deblock_16x16:
+ push r6-r15, lr
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+
+process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl vert_filter
+ sub r3,8
+ b start_deblock_loop
+deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt start_again
+ pop r6-r15, pc
+start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+vert_filter:
+ push lr
+
+ vmov HX(P3,0), V(16,12)+r3
+ vmov HX(P2,0), V(16,13)+r3
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+ vmov HX(Q2,0), V(16,18)+r3
+ vmov HX(Q3,0), V(16,19)+r3
+
+ bl do_luma_filter
+
+ vadds V(16,13)+r3, HX(P2,0), 0
+ vadds V(16,14)+r3, HX(P1,0), 0
+ vadds V(16,15)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds V(16,16)+r3, HX(Q0,0), 0
+ vadds V(16,17)+r3, HX(Q1,0), 0
+ vadds V(16,18)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+horz_filter:
+ push lr
+
+ vmov HX(P3,0), H(12,0)+r3
+ vmov HX(P2,0), H(13,0)+r3
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+ vmov HX(Q2,0), H(18,0)+r3
+ vmov HX(Q3,0), H(19,0)+r3
+
+ bl do_luma_filter
+
+ vadds H(13,0)+r3, HX(P2,0), 0
+ vadds H(14,0)+r3, HX(P1,0), 0
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+ vadds H(17,0)+r3, HX(Q1,0), 0
+ vadds H(18,0)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4 length edge
+do_luma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+ valtl HX(beta,0),H(setup,0),H(setup,0)
+ valtu HX(tc,0),H(setup,0),H(setup,0)
+ vmul HX(tc25,0), HX(tc,0), 5
+ vadd HX(tc25,0),HX(tc25,0), 1
+ vasr HX(tc25,0), HX(tc25,0), 1
+
+ # Compute decision
+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+
+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+
+ vadd HX(d,0), HX(dp,0), HX(dq,0)
+ vasr HX(beta2,0),HX(beta,0),2
+ vasr HX(beta3,0),HX(beta,0),3
+
+ # Compute flags that are negative if all conditions pass
+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+
+ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+ vmov HX(decision,0), 1 IFNN
+ vadd H(decision,0),H(decision,3),0 IFN
+ vadd H(decision,16),H(decision,19),0 IFN
+ vmov -,HX(decision,0) SETF # N marks strong filter
+ vmov HX(decision,0), 1 IFNN # NN marks normal filter
+
+ vadd HX(do_filter,0), HX(d,3), HX(d,0)
+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+ vmov HX(decision,0),0 IFNN # Z marks no filter
+
+ # Expand out decision (currently one valid value every 4 pixels) 0...1...2...3
+ # First extract out even terms
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
+ # Now expand back
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+
+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+
+ # Do a quick check to see if there is anything to do
+ mov r11, 0 # Signal no filtering
+ vmov -,1 IFNZ SUMS r5
+ cmp r5,0
+ beq filtering_done
+ mov r11, 1 # Signal some filtering
+ # And whether there is any strong filtering
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq normal_filtering
+
+ ##############################################################################
+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+
+ # Take a copy of the original pixels for use in decision calculation
+ vmov HX(P0,32),HX(P0,0)
+ vmov HX(Q0,32),HX(Q0,0)
+ vmov HX(P1,32),HX(P1,0)
+ vmov HX(Q1,32),HX(Q1,0)
+ vmov HX(P2,32),HX(P2,0)
+ vmov HX(Q2,32),HX(Q2,0)
+
+ vadd -,HX(P2,32),4 CLRA SACC
+ vshl -,HX(P1,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl HX(delta,0),HX(Q1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+
+ vadd -,HX(P2,32),2 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vshl HX(delta,0),HX(Q0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(P1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+
+ vadd -,HX(Q0,32),4 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vmul -,HX(P2,32),3 SACC
+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+ #vmov HX(P2,0),3 IFN
+
+ # Now reverse all P/Qs
+
+ vadd -,HX(Q2,32),4 CLRA SACC
+ vshl -,HX(Q1,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl HX(delta,0),HX(P1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+
+ vadd -,HX(Q2,32),2 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vshl HX(delta,0),HX(P0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+
+ vadd -,HX(P0,32),4 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vmul -,HX(Q2,32),3 SACC
+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+
+ ##############################################################################
+ # Normal filtering
+normal_filtering:
+ # Invert the decision flags
+ # make the instruction more complicated than needed, as the assembler has a bug and loses the SETF otherwise
+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+ vmov -, HX(tc10,0) SETF # IFN means normal filtering
+
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq filtering_done
+
+ vasr HX(tc2,0), HX(tc,0), 1
+ vmul HX(tc10,0), HX(tc,0), 10
+
+ vasr HX(thresh,0), HX(beta,0), 1
+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+
+ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P1 pixel
+ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q1 pixel
+ # Expand ptest and qtest together
+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+
+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+ vmov -,8 CLRA SACC
+ vmul -,HX(delta0,0), 9 SACC
+ vmul HX(delta0,0),HX(delta1,0), r6 SACC
+ vasr HX(delta0,0), HX(delta0,0), 4
+ vdist HX(deltatest,0), HX(delta0,0), 0
+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+
+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+
+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+ vadd HX(deltap1,0), HX(deltap1,0), 1
+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+ vasr HX(deltap1,0), HX(deltap1,0), 1
+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+
+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+ vadd HX(deltaq1,0), HX(deltaq1,0), 1
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+ vrsub -, HX(delta0,0), 0 SACC
+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1
+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+
+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+
+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+
+ vmov -,HX(deltatest,0) SETF
+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+
+ #vmov HX(P2,0),1 IFN
+
+filtering_done:
+ b lr
+
+
+hevc_uv_deblock_16x16:
+ push r6-r15, lr
+ mov r14,0
+ b hevc_uv_start
+hevc_uv_deblock_16x16_with_clear:
+ push r6-r15, lr
+ mov r14,1
+ b hevc_uv_start
+
+hevc_uv_start:
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+# r14 is 1 if we should clear the old contents, or 0 if not
+
+uv_process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ cmp r14,1
+ bne uv_skip0
+ vstb H(zeros,0),(r4)
+uv_skip0:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl uv_vert_filter
+ sub r3,8
+ b uv_start_deblock_loop
+uv_deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ cmp r14,1
+ bne uv_skip1
+ vstb H(zeros,0),(r4)
+uv_skip1:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip3
+ vstb H(zeros,0),-16(r4)
+uv_skip3:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,uv_skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+uv_start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt uv_deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip2
+ vstb H(zeros,0),-16(r4)
+uv_skip2:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,uv_skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt uv_start_again
+ pop r6-r15, pc
+uv_start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b uv_process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16-high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - a final saturation step is performed when placing them back into their correct locations
+
+uv_vert_filter:
+ push lr
+
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+
+ bl do_chroma_filter
+
+ vadds V(16,15)+r3, HX(P0,0), 0
+ vadds V(16,16)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+uv_horz_filter:
+ push lr
+
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+
+ bl do_chroma_filter
+
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# r4 points to the array of beta/tc values, one entry per 4-sample edge segment
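+# For reference, this implements the usual HEVC chroma delta (the final clip
+# to the sample range is done by the vadds/vsubs in the callers):
+#   delta = clamp((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, +tc)
+#   p0 += delta ; q0 -= delta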
+do_chroma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+ valtl HX(tc,0),H(setup,0),H(setup,0)
+
+ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+ vsub -,HX(P1,0),HX(Q1,0) SACC
+ vmov HX(delta,0),4 SACC
+ vasr HX(delta,0),HX(delta,0),3
+ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+ vadd HX(P0,0),HX(P0,0),HX(delta,0)
+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+ b lr
+
+# r0 = list
+# r1 = number
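+# Each command is six 32-bit words which are loaded into r0-r5 and passed
+# directly to hevc_trans_16x16, so the list is just that routine's argument
+# sets laid end to end.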
+hevc_run_command_list:
+ push r6-r7, lr
+ mov r6, r0
+ mov r7, r1
+loop_cmds:
+ ld r0,(r6) # How to encode r6++?
+ add r6,4
+ ld r1,(r6)
+ add r6,4
+ ld r2,(r6)
+ add r6,4
+ ld r3,(r6)
+ add r6,4
+ ld r4,(r6)
+ add r6,4
+ ld r5,(r6)
+ add r6,4
+ bl hevc_trans_16x16
+ sub r7,1
+ cmp r7,0
+ bgt loop_cmds
+
+ pop r6-r7, pc
diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
new file mode 100644
index 0000000000..b0e9902d82
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform10.h
@@ -0,0 +1,3070 @@
+static const unsigned char rpi_hevc_transform10 [] = {
+21,
+106,
+0,
+144,
+47,
+1,
+37,
+106,
+0,
+144,
+66,
+1,
+53,
+106,
+0,
+144,
+192,
+4,
+69,
+106,
+0,
+144,
+192,
+4,
+85,
+106,
+0,
+144,
+220,
+5,
+169,
+3,
+62,
+64,
+79,
+64,
+3,
+232,
+32,
+0,
+0,
+0,
+12,
+248,
+0,
+136,
+0,
+0,
+192,
+248,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+12,
+248,
+0,
+168,
+0,
+0,
+192,
+248,
+0,
+0,
+0,
+96,
+3,
+232,
+32,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+8,
+232,
+0,
+4,
+0,
+0,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+4,
+232,
+64,
+0,
+0,
+0,
+5,
+232,
+0,
+2,
+0,
+0,
+128,
+69,
+113,
+66,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+128,
+69,
+113,
+70,
+128,
+144,
+40,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+16,
+0,
+76,
+254,
+48,
+192,
+9,
+4,
+32,
+8,
+0,
+0,
+4,
+254,
+0,
+144,
+128,
+2,
+0,
+8,
+2,
+0,
+128,
+144,
+23,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+20,
+0,
+76,
+254,
+48,
+192,
+6,
+4,
+32,
+8,
+0,
+0,
+140,
+248,
+44,
+0,
+0,
+0,
+32,
+48,
+4,
+0,
+128,
+69,
+113,
+66,
+242,
+140,
+211,
+192,
+34,
+31,
+41,
+3,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+96,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+224,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+225,
+64,
+242,
+64,
+3,
+232,
+128,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+57,
+239,
+224,
+247,
+255,
+255,
+72,
+192,
+95,
+207,
+88,
+122,
+88,
+124,
+137,
+64,
+26,
+64,
+4,
+232,
+64,
+0,
+0,
+0,
+149,
+96,
+161,
+64,
+152,
+64,
+128,
+144,
+35,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+27,
+0,
+4,
+232,
+0,
+2,
+0,
+0,
+101,
+96,
+145,
+64,
+168,
+64,
+128,
+144,
+19,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+11,
+0,
+74,
+232,
+0,
+8,
+0,
+0,
+242,
+140,
+221,
+192,
+57,
+239,
+32,
+8,
+0,
+0,
+41,
+3,
+239,
+3,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+248,
+4,
+0,
+12,
+248,
+0,
+132,
+64,
+0,
+192,
+248,
+4,
+0,
+0,
+96,
+255,
+159,
+154,
+255,
+0,
+232,
+0,
+4,
+0,
+0,
+255,
+159,
+165,
+255,
+4,
+255,
+48,
+204,
+16,
+3,
+224,
+251,
+62,
+0,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+128,
+64,
+6,
+232,
+64,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+32,
+247,
+240,
+207,
+16,
+3,
+32,
+247,
+176,
+207,
+17,
+19,
+32,
+247,
+112,
+207,
+18,
+35,
+32,
+247,
+48,
+207,
+19,
+51,
+32,
+247,
+240,
+206,
+20,
+67,
+32,
+247,
+176,
+206,
+21,
+83,
+32,
+247,
+112,
+206,
+22,
+99,
+32,
+247,
+48,
+206,
+23,
+115,
+32,
+247,
+240,
+205,
+24,
+131,
+32,
+247,
+176,
+205,
+25,
+147,
+32,
+247,
+112,
+205,
+26,
+163,
+32,
+247,
+48,
+205,
+27,
+179,
+32,
+247,
+240,
+204,
+28,
+195,
+32,
+247,
+176,
+204,
+29,
+211,
+32,
+247,
+112,
+204,
+30,
+227,
+32,
+247,
+48,
+204,
+31,
+243,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+0,
+237,
+32,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+111,
+3,
+4,
+254,
+0,
+128,
+0,
+4,
+0,
+248,
+0,
+0,
+2,
+232,
+32,
+0,
+0,
+0,
+140,
+248,
+32,
+0,
+0,
+0,
+224,
+35,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+193,
+232,
+0,
+1,
+0,
+0,
+1,
+106,
+116,
+30,
+90,
+0,
+169,
+3,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+137,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+129,
+0,
+131,
+102,
+0,
+158,
+67,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+108,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+100,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+161,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+150,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+182,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+112,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+101,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+103,
+255,
+239,
+3,
+0,
+254,
+0,
+143,
+92,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+93,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+210,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+211,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+107,
+0,
+8,
+255,
+99,
+23,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+23,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+52,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+52,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+0,
+143,
+12,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+13,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+18,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+19,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+33,
+0,
+8,
+255,
+99,
+3,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+3,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+4,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+4,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+137,
+47,
+240,
+40,
+246,
+2,
+140,
+47,
+240,
+128,
+245,
+99,
+140,
+5,
+4,
+0,
+247,
+99,
+140,
+1,
+20,
+88,
+246,
+99,
+140,
+1,
+20,
+0,
+247,
+35,
+136,
+62,
+226,
+32,
+247,
+35,
+136,
+32,
+210,
+0,
+247,
+34,
+136,
+63,
+2,
+208,
+246,
+34,
+136,
+0,
+4,
+0,
+247,
+99,
+136,
+58,
+162,
+32,
+247,
+99,
+136,
+33,
+146,
+0,
+247,
+98,
+136,
+59,
+18,
+208,
+246,
+98,
+136,
+0,
+20,
+0,
+247,
+162,
+136,
+33,
+2,
+88,
+246,
+98,
+137,
+2,
+68,
+88,
+246,
+162,
+137,
+3,
+68,
+208,
+254,
+227,
+136,
+60,
+242,
+192,
+243,
+188,
+11,
+208,
+254,
+227,
+136,
+56,
+178,
+192,
+243,
+188,
+10,
+32,
+255,
+226,
+136,
+38,
+58,
+192,
+243,
+60,
+0,
+208,
+254,
+227,
+136,
+59,
+242,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+49,
+58,
+192,
+243,
+60,
+128,
+0,
+255,
+226,
+136,
+34,
+34,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+37,
+58,
+192,
+243,
+60,
+128,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+194,
+8,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+255,
+202,
+40,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+254,
+0,
+240,
+35,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+226,
+140,
+34,
+34,
+195,
+243,
+60,
+0,
+32,
+255,
+227,
+140,
+36,
+58,
+192,
+243,
+60,
+0,
+0,
+254,
+192,
+136,
+0,
+4,
+0,
+240,
+0,
+160,
+16,
+246,
+226,
+136,
+35,
+50,
+16,
+246,
+226,
+136,
+35,
+50,
+32,
+246,
+226,
+136,
+35,
+50,
+32,
+254,
+226,
+136,
+35,
+58,
+192,
+243,
+60,
+0,
+11,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+115,
+5,
+106,
+0,
+144,
+173,
+1,
+27,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+227,
+0,
+64,
+246,
+163,
+140,
+1,
+4,
+0,
+246,
+192,
+175,
+63,
+2,
+0,
+246,
+192,
+174,
+59,
+2,
+0,
+246,
+128,
+175,
+62,
+2,
+0,
+246,
+128,
+174,
+58,
+2,
+0,
+246,
+64,
+175,
+61,
+2,
+0,
+246,
+64,
+174,
+57,
+2,
+0,
+255,
+43,
+240,
+4,
+212,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+228,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+191,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+143,
+52,
+242,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+212,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+180,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+190,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+143,
+52,
+226,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+180,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+212,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+196,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+189,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+143,
+52,
+210,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+148,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+164,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+228,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+187,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+142,
+52,
+178,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+148,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+244,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+186,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+142,
+52,
+162,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+244,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+148,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+132,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+185,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+142,
+52,
+146,
+192,
+243,
+60,
+128,
+64,
+255,
+98,
+141,
+0,
+52,
+192,
+243,
+0,
+0,
+0,
+254,
+0,
+240,
+53,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+177,
+0,
+88,
+246,
+163,
+140,
+1,
+4,
+128,
+245,
+99,
+141,
+10,
+4,
+88,
+246,
+162,
+138,
+1,
+68,
+0,
+247,
+162,
+138,
+36,
+162,
+88,
+254,
+162,
+138,
+3,
+164,
+192,
+243,
+128,
+11,
+0,
+255,
+226,
+137,
+32,
+2,
+195,
+243,
+60,
+0,
+32,
+247,
+226,
+137,
+42,
+114,
+0,
+255,
+34,
+138,
+33,
+18,
+195,
+243,
+60,
+0,
+32,
+247,
+34,
+138,
+42,
+130,
+16,
+246,
+98,
+138,
+40,
+114,
+16,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+226,
+137,
+41,
+146,
+40,
+246,
+34,
+138,
+41,
+146,
+32,
+247,
+163,
+141,
+63,
+178,
+32,
+247,
+227,
+141,
+62,
+162,
+0,
+254,
+0,
+240,
+8,
+4,
+0,
+240,
+128,
+11,
+128,
+253,
+35,
+240,
+9,
+100,
+192,
+243,
+128,
+10,
+128,
+253,
+163,
+141,
+128,
+115,
+192,
+243,
+152,
+10,
+88,
+246,
+163,
+141,
+4,
+100,
+208,
+246,
+35,
+139,
+0,
+100,
+32,
+255,
+34,
+139,
+53,
+202,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+139,
+0,
+4,
+0,
+240,
+0,
+160,
+240,
+246,
+163,
+141,
+48,
+98,
+0,
+247,
+99,
+139,
+63,
+210,
+0,
+247,
+98,
+139,
+1,
+212,
+88,
+254,
+98,
+139,
+1,
+212,
+192,
+243,
+128,
+11,
+32,
+255,
+99,
+139,
+62,
+98,
+192,
+243,
+188,
+10,
+88,
+246,
+98,
+139,
+1,
+212,
+240,
+246,
+98,
+139,
+50,
+210,
+0,
+247,
+163,
+128,
+59,
+146,
+0,
+247,
+160,
+128,
+1,
+36,
+88,
+254,
+160,
+128,
+1,
+36,
+192,
+243,
+128,
+11,
+0,
+247,
+163,
+128,
+58,
+98,
+64,
+255,
+35,
+240,
+0,
+100,
+192,
+243,
+128,
+10,
+64,
+255,
+163,
+128,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+160,
+128,
+1,
+36,
+240,
+246,
+160,
+128,
+50,
+34,
+8,
+255,
+227,
+143,
+54,
+242,
+192,
+243,
+60,
+128,
+40,
+255,
+227,
+142,
+54,
+178,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+39,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+143,
+45,
+226,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+44,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+40,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+142,
+2,
+162,
+192,
+243,
+60,
+128,
+90,
+0,
+169,
+3,
+14,
+96,
+4,
+31,
+169,
+3,
+30,
+96,
+1,
+31,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+143,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+135,
+0,
+131,
+102,
+0,
+158,
+71,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+112,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+104,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+123,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+112,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+178,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+72,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+61,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+95,
+255,
+239,
+3,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+47,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+13,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+140,
+47,
+240,
+32,
+247,
+35,
+141,
+63,
+178,
+64,
+254,
+35,
+141,
+2,
+68,
+192,
+243,
+128,
+11,
+32,
+255,
+35,
+240,
+58,
+226,
+192,
+243,
+188,
+10,
+0,
+254,
+0,
+141,
+4,
+4,
+0,
+240,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+240,
+246,
+35,
+141,
+48,
+66,
+0,
+247,
+227,
+143,
+52,
+242,
+32,
+247,
+227,
+142,
+52,
+178,
+90,
+0,
+161,
+3,
+6,
+64,
+23,
+64,
+96,
+8,
+70,
+98,
+97,
+8,
+70,
+98,
+98,
+8,
+70,
+98,
+99,
+8,
+70,
+98,
+100,
+8,
+70,
+98,
+101,
+8,
+70,
+98,
+255,
+159,
+8,
+250,
+23,
+102,
+7,
+106,
+112,
+30,
+33,
+3,
+};
diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
new file mode 100644
index 0000000000..2901b6568d
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform8.h
@@ -0,0 +1,3070 @@
+static const unsigned char rpi_hevc_transform8 [] = {
+21,
+106,
+0,
+144,
+47,
+1,
+37,
+106,
+0,
+144,
+66,
+1,
+53,
+106,
+0,
+144,
+192,
+4,
+69,
+106,
+0,
+144,
+192,
+4,
+85,
+106,
+0,
+144,
+220,
+5,
+169,
+3,
+62,
+64,
+79,
+64,
+3,
+232,
+32,
+0,
+0,
+0,
+12,
+248,
+0,
+136,
+0,
+0,
+192,
+248,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+12,
+248,
+0,
+168,
+0,
+0,
+192,
+248,
+0,
+0,
+0,
+96,
+3,
+232,
+32,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+8,
+232,
+0,
+4,
+0,
+0,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+4,
+232,
+64,
+0,
+0,
+0,
+5,
+232,
+0,
+8,
+0,
+0,
+128,
+69,
+113,
+66,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+128,
+69,
+113,
+70,
+128,
+144,
+40,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+16,
+0,
+76,
+254,
+48,
+192,
+9,
+4,
+32,
+8,
+0,
+0,
+4,
+254,
+0,
+144,
+128,
+2,
+0,
+8,
+2,
+0,
+128,
+144,
+23,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+20,
+0,
+76,
+254,
+48,
+192,
+4,
+4,
+32,
+8,
+0,
+0,
+140,
+248,
+44,
+0,
+0,
+0,
+32,
+48,
+4,
+0,
+128,
+69,
+113,
+66,
+242,
+140,
+211,
+192,
+34,
+31,
+41,
+3,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+96,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+224,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+225,
+64,
+242,
+64,
+3,
+232,
+128,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+57,
+239,
+224,
+247,
+255,
+255,
+72,
+192,
+95,
+207,
+88,
+122,
+88,
+124,
+137,
+64,
+26,
+64,
+4,
+232,
+64,
+0,
+0,
+0,
+149,
+96,
+161,
+64,
+152,
+64,
+128,
+144,
+35,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+27,
+0,
+4,
+232,
+0,
+8,
+0,
+0,
+69,
+96,
+145,
+64,
+168,
+64,
+128,
+144,
+19,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+11,
+0,
+74,
+232,
+0,
+8,
+0,
+0,
+242,
+140,
+221,
+192,
+57,
+239,
+32,
+8,
+0,
+0,
+41,
+3,
+239,
+3,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+248,
+4,
+0,
+12,
+248,
+0,
+132,
+64,
+0,
+192,
+248,
+4,
+0,
+0,
+96,
+255,
+159,
+154,
+255,
+0,
+232,
+0,
+4,
+0,
+0,
+255,
+159,
+165,
+255,
+4,
+255,
+48,
+204,
+16,
+3,
+224,
+251,
+62,
+0,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+128,
+64,
+6,
+232,
+64,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+32,
+247,
+240,
+207,
+16,
+3,
+32,
+247,
+176,
+207,
+17,
+19,
+32,
+247,
+112,
+207,
+18,
+35,
+32,
+247,
+48,
+207,
+19,
+51,
+32,
+247,
+240,
+206,
+20,
+67,
+32,
+247,
+176,
+206,
+21,
+83,
+32,
+247,
+112,
+206,
+22,
+99,
+32,
+247,
+48,
+206,
+23,
+115,
+32,
+247,
+240,
+205,
+24,
+131,
+32,
+247,
+176,
+205,
+25,
+147,
+32,
+247,
+112,
+205,
+26,
+163,
+32,
+247,
+48,
+205,
+27,
+179,
+32,
+247,
+240,
+204,
+28,
+195,
+32,
+247,
+176,
+204,
+29,
+211,
+32,
+247,
+112,
+204,
+30,
+227,
+32,
+247,
+48,
+204,
+31,
+243,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+0,
+237,
+32,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+111,
+3,
+4,
+254,
+0,
+128,
+0,
+4,
+0,
+248,
+0,
+0,
+2,
+232,
+32,
+0,
+0,
+0,
+140,
+248,
+32,
+0,
+0,
+0,
+224,
+35,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+193,
+232,
+0,
+1,
+0,
+0,
+1,
+106,
+116,
+30,
+90,
+0,
+169,
+3,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+137,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+129,
+0,
+131,
+102,
+0,
+158,
+67,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+108,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+100,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+161,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+150,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+182,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+112,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+101,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+103,
+255,
+239,
+3,
+0,
+254,
+0,
+143,
+92,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+93,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+210,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+211,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+107,
+0,
+8,
+255,
+99,
+23,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+23,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+52,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+52,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+0,
+143,
+12,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+13,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+18,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+19,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+33,
+0,
+8,
+255,
+99,
+3,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+3,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+4,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+4,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+137,
+47,
+240,
+40,
+246,
+2,
+140,
+47,
+240,
+128,
+245,
+99,
+140,
+5,
+4,
+0,
+247,
+99,
+140,
+1,
+20,
+88,
+246,
+99,
+140,
+1,
+20,
+0,
+247,
+35,
+136,
+62,
+226,
+32,
+247,
+35,
+136,
+32,
+210,
+0,
+247,
+34,
+136,
+63,
+2,
+208,
+246,
+34,
+136,
+0,
+4,
+0,
+247,
+99,
+136,
+58,
+162,
+32,
+247,
+99,
+136,
+33,
+146,
+0,
+247,
+98,
+136,
+59,
+18,
+208,
+246,
+98,
+136,
+0,
+20,
+0,
+247,
+162,
+136,
+33,
+2,
+88,
+246,
+98,
+137,
+2,
+68,
+88,
+246,
+162,
+137,
+3,
+68,
+208,
+254,
+227,
+136,
+60,
+242,
+192,
+243,
+188,
+11,
+208,
+254,
+227,
+136,
+56,
+178,
+192,
+243,
+188,
+10,
+32,
+255,
+226,
+136,
+38,
+58,
+192,
+243,
+60,
+0,
+208,
+254,
+227,
+136,
+59,
+242,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+49,
+58,
+192,
+243,
+60,
+128,
+0,
+255,
+226,
+136,
+34,
+34,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+37,
+58,
+192,
+243,
+60,
+128,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+194,
+8,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+255,
+202,
+40,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+254,
+0,
+240,
+35,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+226,
+140,
+34,
+34,
+195,
+243,
+60,
+0,
+32,
+255,
+227,
+140,
+36,
+58,
+192,
+243,
+60,
+0,
+0,
+254,
+192,
+136,
+0,
+4,
+0,
+240,
+0,
+160,
+16,
+246,
+226,
+136,
+35,
+50,
+16,
+246,
+226,
+136,
+35,
+50,
+32,
+246,
+226,
+136,
+35,
+50,
+32,
+254,
+226,
+136,
+35,
+58,
+192,
+243,
+60,
+0,
+11,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+115,
+5,
+106,
+0,
+144,
+173,
+1,
+27,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+227,
+0,
+64,
+246,
+163,
+140,
+1,
+4,
+0,
+246,
+192,
+175,
+63,
+2,
+0,
+246,
+192,
+174,
+59,
+2,
+0,
+246,
+128,
+175,
+62,
+2,
+0,
+246,
+128,
+174,
+58,
+2,
+0,
+246,
+64,
+175,
+61,
+2,
+0,
+246,
+64,
+174,
+57,
+2,
+0,
+255,
+43,
+240,
+4,
+212,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+228,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+191,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+143,
+52,
+242,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+212,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+180,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+190,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+143,
+52,
+226,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+180,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+212,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+196,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+189,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+143,
+52,
+210,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+148,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+164,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+228,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+187,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+142,
+52,
+178,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+148,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+244,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+186,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+142,
+52,
+162,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+244,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+148,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+132,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+185,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+142,
+52,
+146,
+192,
+243,
+60,
+128,
+64,
+255,
+98,
+141,
+0,
+52,
+192,
+243,
+0,
+0,
+0,
+254,
+0,
+240,
+53,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+177,
+0,
+88,
+246,
+163,
+140,
+1,
+4,
+128,
+245,
+99,
+141,
+10,
+4,
+88,
+246,
+162,
+138,
+1,
+68,
+0,
+247,
+162,
+138,
+36,
+162,
+88,
+254,
+162,
+138,
+3,
+164,
+192,
+243,
+128,
+11,
+0,
+255,
+226,
+137,
+32,
+2,
+195,
+243,
+60,
+0,
+32,
+247,
+226,
+137,
+42,
+114,
+0,
+255,
+34,
+138,
+33,
+18,
+195,
+243,
+60,
+0,
+32,
+247,
+34,
+138,
+42,
+130,
+16,
+246,
+98,
+138,
+40,
+114,
+16,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+226,
+137,
+41,
+146,
+40,
+246,
+34,
+138,
+41,
+146,
+32,
+247,
+163,
+141,
+63,
+178,
+32,
+247,
+227,
+141,
+62,
+162,
+0,
+254,
+0,
+240,
+8,
+4,
+0,
+240,
+128,
+11,
+128,
+253,
+35,
+240,
+9,
+100,
+192,
+243,
+128,
+10,
+128,
+253,
+163,
+141,
+128,
+115,
+192,
+243,
+152,
+10,
+88,
+246,
+163,
+141,
+4,
+100,
+208,
+246,
+35,
+139,
+0,
+100,
+32,
+255,
+34,
+139,
+53,
+202,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+139,
+0,
+4,
+0,
+240,
+0,
+160,
+240,
+246,
+163,
+141,
+48,
+98,
+0,
+247,
+99,
+139,
+63,
+210,
+0,
+247,
+98,
+139,
+1,
+212,
+88,
+254,
+98,
+139,
+1,
+212,
+192,
+243,
+128,
+11,
+32,
+255,
+99,
+139,
+62,
+98,
+192,
+243,
+188,
+10,
+88,
+246,
+98,
+139,
+1,
+212,
+240,
+246,
+98,
+139,
+50,
+210,
+0,
+247,
+163,
+128,
+59,
+146,
+0,
+247,
+160,
+128,
+1,
+36,
+88,
+254,
+160,
+128,
+1,
+36,
+192,
+243,
+128,
+11,
+0,
+247,
+163,
+128,
+58,
+98,
+64,
+255,
+35,
+240,
+0,
+100,
+192,
+243,
+128,
+10,
+64,
+255,
+163,
+128,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+160,
+128,
+1,
+36,
+240,
+246,
+160,
+128,
+50,
+34,
+8,
+255,
+227,
+143,
+54,
+242,
+192,
+243,
+60,
+128,
+40,
+255,
+227,
+142,
+54,
+178,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+39,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+143,
+45,
+226,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+44,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+40,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+142,
+2,
+162,
+192,
+243,
+60,
+128,
+90,
+0,
+169,
+3,
+14,
+96,
+4,
+31,
+169,
+3,
+30,
+96,
+1,
+31,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+143,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+135,
+0,
+131,
+102,
+0,
+158,
+71,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+112,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+104,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+123,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+112,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+178,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+72,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+61,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+95,
+255,
+239,
+3,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+47,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+13,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+140,
+47,
+240,
+32,
+247,
+35,
+141,
+63,
+178,
+64,
+254,
+35,
+141,
+2,
+68,
+192,
+243,
+128,
+11,
+32,
+255,
+35,
+240,
+58,
+226,
+192,
+243,
+188,
+10,
+0,
+254,
+0,
+141,
+4,
+4,
+0,
+240,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+240,
+246,
+35,
+141,
+48,
+66,
+0,
+247,
+227,
+143,
+52,
+242,
+32,
+247,
+227,
+142,
+52,
+178,
+90,
+0,
+161,
+3,
+6,
+64,
+23,
+64,
+96,
+8,
+70,
+98,
+97,
+8,
+70,
+98,
+98,
+8,
+70,
+98,
+99,
+8,
+70,
+98,
+100,
+8,
+70,
+98,
+101,
+8,
+70,
+98,
+255,
+159,
+8,
+250,
+23,
+102,
+7,
+106,
+112,
+30,
+33,
+3,
+};
diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
new file mode 100644
index 0000000000..0255f5dd44
--- /dev/null
+++ b/libavcodec/rpi_mailbox.c
@@ -0,0 +1,149 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef RPI
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+
+#include <linux/ioctl.h>
+
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+#include "rpi_mailbox.h"
+//#include <interface/vctypes/vc_image_structs.h>
+
+/*
+ * use ioctl to send mbox property message
+ */
+
+static int mbox_property(int file_desc, void *buf)
+{
+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+ if (ret_val < 0) {
+ printf("ioctl_set_msg failed:%d\n", ret_val);
+ }
+
+#ifdef DEBUG
+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+ for (i=0; i<size/4; i++)
+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+#endif
+ return ret_val;
+}
+
+unsigned mbox_mem_lock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000d; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mbox_mem_unlock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000e; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+#define GET_VCIMAGE_PARAMS 0x30044
+
+int mbox_get_image_params(int fd, VC_IMAGE_T * img)
+{
+ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
+ uint32_t * p = buf;
+ void * rimg;
+ int rv;
+
+ *p++ = 0; // size
+ *p++ = 0; // process request
+ *p++ = GET_VCIMAGE_PARAMS;
+ *p++ = sizeof(*img);
+ *p++ = sizeof(*img);
+ rimg = p;
+ memcpy(p, img, sizeof(*img));
+ p += sizeof(*img) / sizeof(*p);
+ *p++ = 0; // End tag
+ buf[0] = (p - buf) * sizeof(*p);
+
+ rv = mbox_property(fd, buf);
+ memcpy(img, rimg, sizeof(*img));
+
+ return rv;
+}
+
+int mbox_open(void) {
+ int file_desc;
+
+ // open a char device file used for communicating with kernel mbox driver
+ file_desc = open(DEVICE_FILE_NAME, 0);
+ if (file_desc < 0) {
+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+ }
+ return file_desc;
+}
+
+void mbox_close(int file_desc) {
+ close(file_desc);
+}
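+
+// Typical usage (an illustrative sketch only - the real callers are the
+// gpu_malloc_* / gpu_free helpers in rpi_qpu.c):
+// int mb = mbox_open();
+// unsigned vc_addr = mbox_mem_lock(mb, vc_handle); // vc_handle from vcsm_vc_hdl_from_hdl()
+// ... pass vc_addr (a VideoCore bus address) to the VPU/QPU ...
+// mbox_mem_unlock(mb, vc_handle);
+// mbox_close(mb);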
+
+#endif
+
diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
new file mode 100644
index 0000000000..b3168788d2
--- /dev/null
+++ b/libavcodec/rpi_mailbox.h
@@ -0,0 +1,58 @@
+#ifndef RPI_MAILBOX_H
+#define RPI_MAILBOX_H
+
+/* The image structure. */
+typedef struct vc_image_extra_uv_s {
+ void *u, *v;
+ int vpitch;
+} VC_IMAGE_EXTRA_UV_T;
+
+typedef union {
+ VC_IMAGE_EXTRA_UV_T uv;
+// VC_IMAGE_EXTRA_RGBA_T rgba;
+// VC_IMAGE_EXTRA_PAL_T pal;
+// VC_IMAGE_EXTRA_TF_T tf;
+// VC_IMAGE_EXTRA_BAYER_T bayer;
+// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
+// VC_IMAGE_EXTRA_CODEC_T codec;
+// VC_IMAGE_EXTRA_OPENGL_T opengl;
+} VC_IMAGE_EXTRA_T;
+
+
+typedef struct VC_IMAGE_T {
+ unsigned short type; /* should restrict to 16 bits */
+ unsigned short info; /* format-specific info; zero for VC02 behaviour */
+ unsigned short width; /* width in pixels */
+ unsigned short height; /* height in pixels */
+ int pitch; /* pitch of image_data array in bytes */
+ int size; /* number of bytes available in image_data array */
+ void *image_data; /* pixel data */
+ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
+ void *metadata; /* metadata header for the image */
+ void *pool_object; /* nonNULL if image was allocated from a vc_pool */
+ int mem_handle; /* the mem handle for relocatable memory storage */
+ int metadata_size; /* size of metadata of each channel in bytes */
+ int channel_offset; /* offset of consecutive channels in bytes */
+ uint32_t video_timestamp;/* 90000 Hz RTP time domain - derived from audio timestamp */
+ uint8_t num_channels; /* number of channels (2 for stereo) */
+ uint8_t current_channel;/* the channel this header is currently pointing to */
+ uint8_t linked_multichann_flag;/* Indicates the header has the linked-multichannel structure*/
+ uint8_t is_channel_linked; /* Track if the above structure has been used to link the header
+ into a linked-multichannel image */
+ uint8_t channel_index; /* index of the channel this header represents while
+ it is being linked. */
+ uint8_t _dummy[3]; /* pad struct to 64 bytes */
+} VC_IMAGE_T;
+
+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; /* compile-time check: array size becomes -1 (an error) unless VC_IMAGE_T is exactly 64 bytes */
+
+
+extern int mbox_open(void);
+extern void mbox_close(int file_desc);
+
+extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
+extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
+
+int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+
+#endif
diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h
new file mode 100644
index 0000000000..e6127749ea
--- /dev/null
+++ b/libavcodec/rpi_opts.h
@@ -0,0 +1,46 @@
+#ifndef AVCODEC_RPI_OPTS_H
+#define AVCODEC_RPI_OPTS_H
+
+// define RPI to split the CABAC/prediction/transform into separate stages
+#ifndef RPI
+
+ #define RPI_INTER 0
+ #define RPI_TSTATS 0
+ #define RPI_HEVC_SAND 0
+
+#else
+ #include "config.h"
+
+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU
+
+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+ // This has no effect unless RPI_WORKER is defined
+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as
+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one
+ // free for the foreground to fill in.
+ #define RPI_MAX_JOBS 2
+
+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+ // As it stands there is something mildly broken in VPU deblock - looks mostly OK
+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_)
+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM
+// #define RPI_DEBLOCK_VPU
+
+ #define RPI_VPU_DEBLOCK_CACHED 1
+
+ #if HAVE_NEON
+ #define RPI_HEVC_SAND 1
+ #else
+ // Sand bust on Pi1 currently - reasons unknown
+ #define RPI_HEVC_SAND 0
+ #endif
+
+
+ #define RPI_QPU_EMU_Y 0
+ #define RPI_QPU_EMU_C 0
+
+ #define RPI_TSTATS 0
+#endif
+
+#endif
+
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000000..e872b855b7
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,935 @@
+#ifdef RPI
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/avassert.h"
+
+#include "config.h"
+
+#include <pthread.h>
+#include <time.h>
+
+#include <interface/vcsm/user-vcsm.h>
+
+#include "rpi_mailbox.h"
+#include "rpi_qpu.h"
+#include "rpi_shader.h"
+#include "rpi_hevc_transform8.h"
+#include "rpi_hevc_transform10.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+#pragma GCC diagnostic pop
+
+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
+
+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
+// Beware this is expensive and will probably throw off all other timing by >10%
+#define RPI_TRACE_QPU_PROFILE_ALL 0
+
+// QPU "noflush" flags
+// a mixture of flushing & profiling
+
+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
+
+#define vcos_verify_ge0(x) ((x)>=0)
+
+// Size in 32bit words
+#define QPU_CODE_SIZE 4098
+#define VPU_CODE_SIZE 2048
+
+static const short rpi_transMatrix2even[32][16] = { // Even rows first
+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
+// Odd rows
+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
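+// (The table above holds the HEVC transform coefficients, even rows first as
+// noted; gpu_init() copies it into GPU memory and vpu_get_constants() returns
+// its VideoCore address for the VPU transform code.)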
+
+// Code/constants on GPU
+struct GPU
+{
+ unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code8[VPU_CODE_SIZE];
+ unsigned int vpu_code10[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
+};
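+// A single instance of this struct is allocated in uncached GPU memory by
+// gpu_init(); members are then located from the VideoCore side as
+// code_gm_ptr.vc + offsetof(struct GPU, member) - see vpu_get_fn() and
+// vpu_get_constants() below.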
+
+#define CFE_ENTS_PER_A 8
+// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
+// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
+// allow 128
+#define CFE_ENT_COUNT 128
+#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A)
+
+struct rpi_cache_flush_env_s {
+// unsigned int n;
+// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
+ struct vcsm_user_clean_invalid2_s v;
+};
+
+#define WAIT_COUNT_MAX 16
+
+typedef struct trace_time_one_s
+{
+ int count;
+ int64_t start[WAIT_COUNT_MAX];
+ int64_t total[WAIT_COUNT_MAX];
+} trace_time_one_t;
+
+typedef struct trace_time_wait_s
+{
+ unsigned int jcount;
+ int64_t start0;
+ int64_t last_update;
+ trace_time_one_t active;
+ trace_time_one_t wait;
+} trace_time_wait_t;
+
+typedef struct vq_wait_s
+{
+ sem_t sem;
+ struct vq_wait_s * next;
+} vq_wait_t;
+
+#define VQ_WAIT_POOL_SIZE 16
+typedef struct vq_wait_pool_s
+{
+ vq_wait_t * head;
+ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
+} vq_wait_pool_t;
+
+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+
+typedef struct gpu_env_s
+{
+ int open_count;
+ int init_count;
+ int mb;
+ int vpu_i_cache_flushed;
+ GPU_MEM_PTR_T code_gm_ptr;
+ vq_wait_pool_t wait_pool;
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ trace_time_wait_t ttw;
+#endif
+} gpu_env_t;
+
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+static gpu_env_t * gpu = NULL;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+
+static int64_t ns_time(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
+}
+
+
+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+
+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
+#define T_ARG(t) T_SEC(t), T_MS(t)
+#define T_FMT "%u.%03u"
+
+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
+{
+ // Update totals for levels that are still pending
+ for (int i = 0; i < tto->count; ++i) {
+ tto->total[i] += now - tto->start[i];
+ tto->start[i] = now;
+ }
+
+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
+ prefix,
+ T_ARG(now - start0 - tto->total[0]),
+ T_ARG(tto->total[0]),
+ T_ARG(tto->total[1]),
+ T_ARG(tto->total[2]),
+ T_ARG(tto->total[3]));
+}
+
+
+static void tto_start(trace_time_one_t * const tto, const int64_t now)
+{
+ av_assert0(tto->count < WAIT_COUNT_MAX);
+ tto->start[tto->count++] = now;
+}
+
+static void tto_end(trace_time_one_t * const tto, const int64_t now)
+{
+ const int n = --tto->count;
+ av_assert0(n >= 0);
+ tto->total[n] += now - tto->start[n];
+}
+
+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
+{
+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
+ tto_print(&ttw->active, now, ttw->start0, "Active");
+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
+}
+
+#endif
+
+// GPU memory alloc fns (internal)
+
+// GPU_MEM_PTR_T alloc fns
+static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+ p->numbytes = (numbytes + 255) & ~255; // Round up
+ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mbox_mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+// printf("***** %s, %d\n", __func__, numbytes);
+
+ return 0;
+}
+
+static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mbox_mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+// printf("***** %s, %d\n", __func__, numbytes);
+ return 0;
+}
+
+static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
+ mbox_mem_unlock(mb, p->vc_handle);
+ vcsm_unlock_ptr(p->arm);
+ vcsm_free(p->vcsm_handle);
+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
+// printf("***** %s\n", __func__);
+}
+
+
+// GPU init, free, lock, unlock
+
+static void gpu_term(void)
+{
+ gpu_env_t * const ge = gpu;
+
+ // We have to hope that everything has terminated...
+ gpu = NULL;
+
+ vc_gpuserv_deinit();
+
+ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+
+ vcsm_exit();
+
+ mbox_close(ge->mb);
+
+ vq_wait_pool_deinit(&ge->wait_pool);
+
+ free(ge);
+}
+
+
+// Connect to QPU, returns 0 on success.
+static int gpu_init(gpu_env_t ** const gpu) {
+ volatile struct GPU* ptr;
+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
+ *gpu = NULL;
+
+ if (ge == NULL)
+ return -1;
+
+ if ((ge->mb = mbox_open()) < 0)
+ return -1;
+
+ vq_wait_pool_init(&ge->wait_pool);
+
+ vcsm_init();
+
+ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+
+ // Zero everything so we have zeros between the code bits
+ memset((void *)ptr, 0, sizeof(*ptr));
+
+ // Now copy over the QPU code into GPU memory
+ {
+ int num_bytes = (char *)mc_end - (char *)rpi_shader;
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+ }
+ // And the VPU code
+ {
+ int num_bytes = sizeof(rpi_hevc_transform8);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
+ }
+ {
+ int num_bytes = sizeof(rpi_hevc_transform10);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
+ }
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
+ *gpu = ge;
+ return 0;
+}
+
+
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+static gpu_env_t * gpu_lock(void) {
+ pthread_mutex_lock(&gpu_mutex);
+
+ av_assert0(gpu != NULL);
+ return gpu;
+}
+
+static gpu_env_t * gpu_lock_ref(void)
+{
+ pthread_mutex_lock(&gpu_mutex);
+
+ if (gpu == NULL) {
+ int rv = gpu_init(&gpu);
+ if (rv != 0) {
+ gpu_unlock();
+ return NULL;
+ }
+ }
+
+ ++gpu->open_count;
+ return gpu;
+}
+
+static void gpu_unlock_unref(gpu_env_t * const ge)
+{
+ if (--ge->open_count == 0)
+ gpu_term();
+
+ gpu_unlock();
+}
+
+static inline gpu_env_t * gpu_ptr(void)
+{
+ av_assert0(gpu != NULL);
+ return gpu;
+}
+
+// Public gpu fns
+
+// Allocate memory on GPU
+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+// Returns 0 on success.
+// This allocates memory that will not be cached in ARM's data cache.
+// Therefore safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
+ gpu_unlock();
+ return r;
+}
+
+// This allocates data that will be
+// Cached in ARM L2
+// Uncached in VPU L2
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
+ gpu_unlock();
+ return r;
+}
+
+void gpu_free(GPU_MEM_PTR_T * const p) {
+ gpu_env_t * const ge = gpu_lock();
+ gpu_free_internal(ge->mb, p);
+ gpu_unlock_unref(ge);
+}
+
+unsigned int vpu_get_fn(const unsigned int bit_depth) {
+ // Make sure that the gpu is initialized
+ av_assert0(gpu != NULL);
+ switch (bit_depth){
+ case 8:
+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
+ case 10:
+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
+ default:
+ av_assert0(0);
+ }
+ return 0;
+}
+
+unsigned int vpu_get_constants(void) {
+ av_assert0(gpu != NULL);
+ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
+}
+
+int gpu_get_mailbox(void)
+{
+ av_assert0(gpu);
+ return gpu->mb;
+}
+
+void gpu_ref(void)
+{
+ gpu_lock_ref();
+ gpu_unlock();
+}
+
+void gpu_unref(void)
+{
+ gpu_env_t * const ge = gpu_lock();
+ gpu_unlock_unref(ge);
+}
+
+// ----------------------------------------------------------------------------
+//
+// Cache flush functions
+
+#define CACHE_EL_MAX 16
+
+rpi_cache_flush_env_t * rpi_cache_flush_init(void)
+{
+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) +
+ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX);
+ if (rfe == NULL)
+ return NULL;
+
+ rfe->v.op_count = 0;
+ return rfe;
+}
+
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+{
+ if (rfe != NULL)
+ free(rfe);
+}
+
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
+{
+ int rc = 0;
+
+ if (vcsm_clean_invalid2(&rfe->v) != 0)
+ rc = -1;
+
+ free(rfe);
+
+ if (rc == 0)
+ return 0;
+
+  av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
+ return rc;
+}
+
+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
+{
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+
+ av_assert0(rfe->v.op_count <= CACHE_EL_MAX);
+
+ b->invalidate_mode = mode;
+ b->block_count = blocks;
+ b->start_address = gm->arm + offset0;
+ b->block_size = block_size;
+ b->inter_block_stride = block_stride;
+}
+
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset, const unsigned int size)
+{
+ // Deal with empty pointer trivially
+ if (gm == NULL || size == 0)
+ return;
+
+ av_assert0(offset <= gm->numbytes);
+ av_assert0(size <= gm->numbytes);
+ av_assert0(offset + size <= gm->numbytes);
+
+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
+}
+
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
+{
+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
+}
+
+
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
+{
+#if !RPI_ONE_BUF
+#error Fixme! (NIF)
+#endif
+ if (gpu_is_buf1(frame)) {
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
+ }
+ else
+ {
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
+ }
+}
+
+// Flush an area of a frame
+// Width, height, x0, y0 in luma pels
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
+{
+ const unsigned int y_offset = frame->linesize[0] * y0;
+ const unsigned int y_size = frame->linesize[0] * height;
+ // Round UV up/down to get everything
+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
+
+#if 0
+ // *** frame->height is cropped height so not good
+ // As all unsigned they will also reject -ve
+ // Test individually as well as added to reject overflow
+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
+ av_assert0(n <= (unsigned int)frame->height);
+ av_assert0(start_line + n <= (unsigned int)frame->height);
+#endif
+
+ if (!gpu_is_buf1(frame))
+ {
+ if (do_luma) {
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
+ }
+ if (do_chroma) {
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
+ }
+ }
+ else if (!av_rpi_is_sand_frame(frame))
+ {
+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+ if (do_luma) {
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
+ }
+ if (do_chroma) {
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
+ }
+ }
+ else
+ {
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
+ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
+
+ if (do_chroma)
+ {
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+ b->invalidate_mode = mode;
+ b->block_count = block_count;
+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
+ b->block_size = uv_size;
+ b->inter_block_stride = stride1 * stride2;
+ }
+ if (do_luma)
+ {
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+ b->invalidate_mode = mode;
+ b->block_count = block_count;
+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
+ b->block_size = y_size;
+ b->inter_block_stride = stride1 * stride2;
+ }
+ }
+}
+
+// Call this to clean and invalidate a region of memory
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+{
+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
+ rpi_cache_flush_finish(rfe);
+}
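+
+// Illustrative only: the intended pattern is to batch several ranges into one
+// env and flush them with a single finish. "gm", "len" and "frame" below are
+// placeholders for a real buffer and frame.
+//
+//   rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+//   if (rfe != NULL) {
+//       rpi_cache_flush_add_gm_range(rfe, gm, RPI_CACHE_FLUSH_MODE_WRITEBACK, 0, len);
+//       rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+//       rpi_cache_flush_finish(rfe);  // one vcsm_clean_invalid2 call for all added blocks
+//   }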
+
+
+// ----------------------------------------------------------------------------
+
+
+// Wait abstractions - mostly so we can easily add profile code
+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+{
+ unsigned int i;
+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+ sem_init(&wp->pool[i].sem, 0, 0);
+ wp->pool[i].next = wp->pool + i + 1;
+ }
+ wp->head = wp->pool + 0;
+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+}
+
+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+{
+ unsigned int i;
+ wp->head = NULL;
+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+ sem_destroy(&wp->pool[i].sem);
+ wp->pool[i].next = NULL;
+ }
+}
+
+
+// If sem_init actually takes time then maybe we want a pool...
+static vq_wait_t * vq_wait_new(void)
+{
+ gpu_env_t * const ge = gpu_lock_ref();
+ vq_wait_t * const wait = ge->wait_pool.head;
+ ge->wait_pool.head = wait->next;
+ wait->next = NULL;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ tto_start(&ge->ttw.active, ns_time());
+#endif
+
+ gpu_unlock();
+ return wait;
+}
+
+static void vq_wait_delete(vq_wait_t * const wait)
+{
+ gpu_env_t * const ge = gpu_lock();
+ wait->next = ge->wait_pool.head;
+ ge->wait_pool.head = wait;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ trace_time_wait_t * const ttw = &ge->ttw;
+ const int64_t now = ns_time();
+ ++ttw->jcount;
+ tto_end(&ttw->wait, now);
+
+ if (ttw->start0 == 0)
+ {
+ ttw->start0 = ttw->active.start[0];
+ ttw->last_update = ttw->start0;
+ }
+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
+ {
+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
+ ttw_print(ttw, now);
+ }
+ }
+#endif
+ gpu_unlock_unref(ge);
+}
+
+static void vq_wait_wait(vq_wait_t * const wait)
+{
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ const int64_t now = ns_time();
+ gpu_env_t * const ge = gpu_lock();
+ tto_start(&ge->ttw.wait, now);
+ gpu_unlock();
+ }
+#endif
+
+ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
+ /* loop */;
+}
+
+static void vq_wait_post(vq_wait_t * const wait)
+{
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ gpu_env_t *const ge = gpu_lock();
+ tto_end(&ge->ttw.active, ns_time());
+ gpu_unlock();
+ }
+#endif
+
+ sem_post(&wait->sem);
+}
+
+
+
+// Header comments were wrong for these two
+#define VPU_QPU_MASK_QPU 1
+#define VPU_QPU_MASK_VPU 2
+
+#define VPU_QPU_JOB_MAX 4
+struct vpu_qpu_job_env_s
+{
+ unsigned int n;
+ unsigned int mask;
+ struct gpu_job_s j[VPU_QPU_JOB_MAX];
+};
+
+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+
+vpu_qpu_job_env_t * vpu_qpu_job_new(void)
+{
+ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
+ return vqj;
+}
+
+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
+{
+ memset(vqj, 0, sizeof(*vqj));
+ free(vqj);
+}
+
+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
+{
+ struct gpu_job_s * const j = vqj->j + vqj->n++;
+ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
+ return j;
+}
+
+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
+{
+ if (vpu_code != 0) {
+ struct gpu_job_s *const j = new_job(vqj);
+ vqj->mask |= VPU_QPU_MASK_VPU;
+
+ j->command = EXECUTE_VPU;
+ // The bottom two bits of the execute address contain no-flush flags
+    // If b0 is unset the VPU I-cache is flushed, so we nearly always want it set
+ // as we never reload code
+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
+ j->u.v.q[1] = r0;
+ j->u.v.q[2] = r1;
+ j->u.v.q[3] = r2;
+ j->u.v.q[4] = r3;
+ j->u.v.q[5] = r4;
+ j->u.v.q[6] = r5;
+ gpu->vpu_i_cache_flushed = 1;
+ }
+}
+
+// flags are QPU_FLAGS_xxx
+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
+{
+ if (n != 0) {
+ struct gpu_job_s *const j = new_job(vqj);
+ vqj->mask |= VPU_QPU_MASK_QPU;
+
+ j->command = EXECUTE_QPU;
+ j->u.q.jobs = n;
+#if RPI_TRACE_QPU_PROFILE_ALL
+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+#else
+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+#endif
+ j->u.q.timeout = 5000;
+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ }
+}
+
+// Convert callback to sem post
+static void vpu_qpu_job_callback_wait(void * v)
+{
+ vq_wait_post(v);
+}
+
+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+{
+ vq_wait_t * wait;
+
+ if (vqj->mask == 0) {
+ *wait_h = NULL;
+ return;
+ }
+
+ // We are going to want a sync object
+ wait = vq_wait_new();
+
+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
+ // If we only posted one thing or only QPU jobs
+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
+ {
+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
+ av_assert0(j->callback.func == 0);
+
+ j->callback.func = vpu_qpu_job_callback_wait;
+ j->callback.cookie = wait;
+ }
+ else
+ {
+ struct gpu_job_s *const j = new_job(vqj);
+
+ j->command = EXECUTE_SYNC;
+ j->u.s.mask = vqj->mask;
+ j->callback.func = vpu_qpu_job_callback_wait;
+ j->callback.cookie = wait;
+ }
+
+ vqj->mask = 0;
+ *wait_h = wait;
+}
+
+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+{
+ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
+}
+
+// Simple wrapper of start + delete
+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
+{
+ int rv;
+ rv = vpu_qpu_job_start(vqj);
+ vpu_qpu_job_delete(vqj);
+ return rv;
+}
+
+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
+{
+ if (wait_h != NULL)
+ {
+ vq_wait_t * const wait = *wait_h;
+ if (wait != NULL) {
+ *wait_h = NULL;
+ vq_wait_wait(wait);
+ vq_wait_delete(wait);
+ }
+ }
+}
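+
+// Illustrative only: the intended lifecycle is new -> add_vpu/add_qpu ->
+// add_sync_this -> finish -> wait. "vpu_code", "r0..r5", "n_jobs" and "mail"
+// below are placeholders for real job parameters.
+//
+//   vpu_qpu_wait_h sync;
+//   vpu_qpu_job_env_t * const vqj = vpu_qpu_job_new();
+//   vpu_qpu_job_add_vpu(vqj, vpu_code, r0, r1, r2, r3, r4, r5);
+//   vpu_qpu_job_add_qpu(vqj, n_jobs, mail);
+//   vpu_qpu_job_add_sync_this(vqj, &sync);
+//   vpu_qpu_job_finish(vqj);  // start the jobs then free the env
+//   vpu_qpu_wait(&sync);      // block until the GPU signals completion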
+
+int vpu_qpu_init(void)
+{
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+
+ if (ge->init_count++ == 0)
+ {
+ vc_gpuserv_init();
+ }
+
+ gpu_unlock();
+ return 0;
+}
+
+void vpu_qpu_term(void)
+{
+ gpu_env_t * const ge = gpu_lock();
+
+ if (--ge->init_count == 0) {
+ vc_gpuserv_deinit();
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ ttw_print(&ge->ttw, ns_time());
+#endif
+ }
+
+ gpu_unlock_unref(ge);
+}
+
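+// Translate a pointer into the rpi_shader[] code array into the VideoCore
+// address of the same code inside the GPU memory block it was copied to
+// during init.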
+uint32_t qpu_fn(const int * const mc_fn)
+{
+ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
+}
+
+
+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+{
+ // Dummy values we can catch with emulation
+ qf->y_pxx = ~1U;
+ qf->y_bxx = ~2U;
+ qf->y_p00 = ~3U;
+ qf->y_b00 = ~4U;
+ qf->c_pxx = ~5U;
+ qf->c_bxx = ~6U;
+
+ switch (bit_depth) {
+ case 8:
+    qf->y_pxx = qpu_fn(mc_filter_y_pxx);
+ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
+ qf->y_p00 = qpu_fn(mc_filter_y_p00);
+ qf->y_b00 = qpu_fn(mc_filter_y_b00);
+ qf->c_pxx = qpu_fn(mc_filter_c_p);
+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
+ qf->c_bxx = qpu_fn(mc_filter_c_b);
+ break;
+ case 10:
+ qf->c_pxx = qpu_fn(mc_filter_c10_p);
+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
+ qf->c_bxx = qpu_fn(mc_filter_c10_b);
+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
+ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
+ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+#endif // RPI
diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
new file mode 100644
index 0000000000..485a08f8ba
--- /dev/null
+++ b/libavcodec/rpi_qpu.h
@@ -0,0 +1,206 @@
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
+#define RPI_ONE_BUF 1
+
+typedef struct gpu_mem_ptr_s {
+ unsigned char *arm; // Pointer to memory mapped on ARM side
+ int vc_handle; // Videocore handle of relocatable memory
+ int vcsm_handle; // Handle for use by VCSM
+ int vc; // Address for use in GPU code
+ int numbytes; // Size of memory block
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T * const p);
+
+#include "libavutil/frame.h"
+#if !RPI_ONE_BUF
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+ return p->vc;
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+}
+
+#else
+
+static inline int gpu_is_buf1(const AVFrame * const frame)
+{
+ return frame->buf[1] == NULL;
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+{
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
+{
+ return av_buffer_pool_opaque(frame->buf[n]);
+}
+
+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
+{
+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
+ return gm->vc + (frame->data[n] - gm->arm);
+}
+
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ return get_vc_address3(frame, 0);
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ return get_vc_address3(frame, 1);
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ return get_vc_address3(frame, 2);
+}
+
+#if 0
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.numbytes = frame->data[1] - frame->data[0];
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 0);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[1] - frame->data[0];
+ g.vc += frame->data[1] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 1);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[2] - frame->data[0];
+ g.vc += frame->data[2] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
+#endif
+#endif
+
+// Cache flush stuff
+
+struct rpi_cache_flush_env_s;
+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+
+rpi_cache_flush_env_t * rpi_cache_flush_init(void);
+// Free env without flushing
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
+// Do the accumulated flush & free the env
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+
+typedef enum
+{
+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
+} rpi_cache_flush_mode_t;
+
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
+ const unsigned int offset, const unsigned int size);
+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+ const unsigned int uv_shift, const int do_luma, const int do_chroma);
+
+// init, add, finish for one gm ptr
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
+
+// QPU specific functions
+
+typedef struct HEVCRpiQpu {
+ uint32_t c_pxx;
+ uint32_t c_pxx_l1;
+ uint32_t c_bxx;
+ uint32_t y_pxx;
+ uint32_t y_bxx;
+ uint32_t y_p00;
+ uint32_t y_b00;
+} HEVCRpiQpu;
+
+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
+
+uint32_t qpu_fn(const int * const mc_fn);
+
+#define QPU_N_GRP 4
+#define QPU_N_MAX 12
+
+#define QPU_MAIL_EL_VALS 2
+
+struct vpu_qpu_wait_s;
+typedef struct vq_wait_s * vpu_qpu_wait_h;
+
+// VPU specific functions
+
+struct vpu_qpu_job_env_s;
+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+
+vpu_qpu_job_h vpu_qpu_job_new(void);
+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
+
+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
+extern unsigned int vpu_get_constants(void);
+
+// Waits for the previously posted code to complete and will null out *wait_h after use
+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
+int vpu_qpu_init(void);
+void vpu_qpu_term(void);
+
+extern int gpu_get_mailbox(void);
+void gpu_ref(void);
+void gpu_unref(void);
+
+#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
index 0000000000..2c6541a8fb
--- /dev/null
+++ b/libavcodec/rpi_shader.c
@@ -0,0 +1,1570 @@
+#include "rpi_shader.h"
+
+#ifdef _MSC_VER
+ #include <stdint.h>
+ /* cast through uintptr_t to avoid warnings */
+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+#else
+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
+#endif
+
+#ifdef __cplusplus
+extern "C" { /* the types are probably wrong... */
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+__declspec(align(8))
+#elif defined(__GNUC__)
+__attribute__((aligned(8)))
+#endif
+unsigned int rpi_shader[] = {
+// ::mc_setup_c_q0
+// ::mc_start
+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_c_qn
+/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1
+/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
+/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
+// :1
+/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+// ::mc_filter_c_p
+/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
+/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
+/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c_p_l1
+/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
+/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
+/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c_b
+/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
+/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
+/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
+/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
+/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
+/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
+/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
+/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
+/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
+/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
+/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
+// :1
+/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
+/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
+/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
+/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
+/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
+/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
+/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
+/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
+/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
+/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_sync_q0
+/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q1
+/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q2
+/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q3
+/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync_q4
+/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q5
+/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q6
+/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q7
+/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync_q8
+/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q9
+/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q10
+/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q11
+/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c_qn
+// ::mc_exit_y_qn
+/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c_q0
+// ::mc_exit_y_q0
+/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_setup_y_q0
+/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_y_qn
+/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif
+/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
+/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
+/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+// :1
+/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
+/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
+// :per_block_setup_8
+/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
+/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
+/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
+/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
+/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
+/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
+/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
+/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
+// ::mc_filter_y_pxx
+/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+// :1
+/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height
+/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_bxx
+/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+// :1
+/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_p00
+/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
+/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
+/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
+/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
+/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
+/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
+/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
+/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
+// :1
+/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
+/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_b00
+/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7
+/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
+/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
+/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
+/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+// :1
+/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
+/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
+/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_setup_c10_q0
+/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_c10_qn
+/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1
+/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
+/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
+/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
+/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
+// :1
+/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+// ::mc_filter_c10_p
+/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
+/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
+/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c10_p_l1
+/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
+/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
+/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c10_b
+/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
+/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
+/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
+/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
+/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
+/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
+/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
+/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
+/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
+/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
+/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
+// :1
+/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
+/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
+/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8)
+/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
+/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
+/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
+/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
+/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
+/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
+/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
+/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_sync10_q0
+/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q1
+/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q2
+/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q3
+/* [0x00002090] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync10_q4
+/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q5
+/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q6
+/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q7
+/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync10_q8
+/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q9
+/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q10
+/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q11
+/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c10_q0
+// ::mc_exit_y10_q0
+/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c10_qn
+// ::mc_exit_y10_qn
+/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_setup_y10_q0
+/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_y10_qn
+/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif
+/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
+/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
+/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+// :1
+/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
+/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
+// :per_block_setup_10
+/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
+/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
+/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
+/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
+/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
+/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
+/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
+/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
+// ::mc_filter_y10_pxx
+/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+// :1
+/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height
+/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_p00
+/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
+/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
+/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
+/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
+/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
+/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
+/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
+/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
+// :1
+/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
+/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_bxx
+/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+// :1
+/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_b00
+/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7
+/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
+/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
+/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
+/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+// :1
+/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
+/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
+/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_end
+};
+#ifdef __HIGHC__
+#pragma Align_to(8, rpi_shader)
+#endif
diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
new file mode 100644
index 0000000000..82bf380eb4
--- /dev/null
+++ b/libavcodec/rpi_shader.h
@@ -0,0 +1,63 @@
+#ifndef rpi_shader_H
+#define rpi_shader_H
+
+extern unsigned int rpi_shader[];
+
+#define mc_setup_c_q0 (rpi_shader + 0)
+#define mc_start (rpi_shader + 0)
+#define mc_setup_c_qn (rpi_shader + 2)
+#define mc_filter_c_p (rpi_shader + 142)
+#define mc_filter_c_p_l1 (rpi_shader + 272)
+#define mc_filter_c_b (rpi_shader + 402)
+#define mc_sync_q0 (rpi_shader + 590)
+#define mc_sync_q1 (rpi_shader + 608)
+#define mc_sync_q2 (rpi_shader + 620)
+#define mc_sync_q3 (rpi_shader + 632)
+#define mc_sync_q4 (rpi_shader + 644)
+#define mc_sync_q5 (rpi_shader + 662)
+#define mc_sync_q6 (rpi_shader + 674)
+#define mc_sync_q7 (rpi_shader + 686)
+#define mc_sync_q8 (rpi_shader + 698)
+#define mc_sync_q9 (rpi_shader + 716)
+#define mc_sync_q10 (rpi_shader + 728)
+#define mc_sync_q11 (rpi_shader + 740)
+#define mc_exit_c_qn (rpi_shader + 752)
+#define mc_exit_y_qn (rpi_shader + 752)
+#define mc_exit_c_q0 (rpi_shader + 770)
+#define mc_exit_y_q0 (rpi_shader + 770)
+#define mc_setup_y_q0 (rpi_shader + 790)
+#define mc_setup_y_qn (rpi_shader + 792)
+#define mc_filter_y_pxx (rpi_shader + 1032)
+#define mc_filter_y_bxx (rpi_shader + 1162)
+#define mc_filter_y_p00 (rpi_shader + 1292)
+#define mc_filter_y_b00 (rpi_shader + 1382)
+#define mc_setup_c10_q0 (rpi_shader + 1462)
+#define mc_setup_c10_qn (rpi_shader + 1464)
+#define mc_filter_c10_p (rpi_shader + 1600)
+#define mc_filter_c10_p_l1 (rpi_shader + 1728)
+#define mc_filter_c10_b (rpi_shader + 1856)
+#define mc_sync10_q0 (rpi_shader + 2042)
+#define mc_sync10_q1 (rpi_shader + 2060)
+#define mc_sync10_q2 (rpi_shader + 2072)
+#define mc_sync10_q3 (rpi_shader + 2084)
+#define mc_sync10_q4 (rpi_shader + 2096)
+#define mc_sync10_q5 (rpi_shader + 2114)
+#define mc_sync10_q6 (rpi_shader + 2126)
+#define mc_sync10_q7 (rpi_shader + 2138)
+#define mc_sync10_q8 (rpi_shader + 2150)
+#define mc_sync10_q9 (rpi_shader + 2168)
+#define mc_sync10_q10 (rpi_shader + 2180)
+#define mc_sync10_q11 (rpi_shader + 2192)
+#define mc_exit_c10_q0 (rpi_shader + 2204)
+#define mc_exit_y10_q0 (rpi_shader + 2204)
+#define mc_exit_c10_qn (rpi_shader + 2224)
+#define mc_exit_y10_qn (rpi_shader + 2224)
+#define mc_setup_y10_q0 (rpi_shader + 2242)
+#define mc_setup_y10_qn (rpi_shader + 2244)
+#define mc_filter_y10_pxx (rpi_shader + 2494)
+#define mc_filter_y10_p00 (rpi_shader + 2624)
+#define mc_filter_y10_bxx (rpi_shader + 2716)
+#define mc_filter_y10_b00 (rpi_shader + 2846)
+#define mc_end (rpi_shader + 2926)
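+/* The entry points above are word offsets into rpi_shader[] (32-bit words);
+ * each QPU instruction is two words, so e.g. mc_setup_y10_q0 at +2242 is at
+ * byte offset 2242*4 = 0x2308, matching the address comments in the generated
+ * shader table. */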
+
+#endif
diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
index 0000000000..ba6cc13a95
--- /dev/null
+++ b/libavcodec/rpi_shader.qasm
@@ -0,0 +1,1741 @@
+
+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+# the warning that we are using rotation & ra/rb registers. r0..3 can be
+# rotated through all 16 elems; ra regs can only be rotated through their
+# local 4. As it happens this is what is wanted here, as we do not want the
+# constants from the other half of the calc.
+
+# PREREAD is the number of requests that we have sitting in the TMU request
+# queue.
+#
+# There are 8 slots available in the TMU request Q for tm0s requests, but
+# only 4 output FIFO entries and overflow is bad (corruption or crash)
+# (If threaded then only 2 out FIFO entries, but we aren't.)
+# In s/w we are effectively limited to the min vertical read which is >= 4
+# so output FIFO is the limit.
+#
+# However in the current world there seems to be no benefit (and a small
+# overhead) in setting this bigger than 2.
+
+.set PREREAD, 4
+
+# Block heights - 8 & 16 are the only numbers we currently support
+
+.set C_BLK_HEIGHT_8, 16
+.set C_BLK_HEIGHT_16, 8
+.set Y_BLK_HEIGHT_8, 16
+.set Y_BLK_HEIGHT_16, 8
+
+# QPU counts - depend on block size
+# If we have a 2-byte format & block_size > 8 then we can only afford
+# 8 QPUs
+# These numbers must match the numbers in rpi_shader_cmd.h
+
+.set N_QPU_8, 12
+.set N_QPU_16, 12
+
+# register allocation
+#
+
+# ra0-3
+# Used as temp and may be loop filter coeffs (split into .8s)
+# or temp in loop. Check usage on an individual basis.
+
+# ra4-7
+# C: L0 H filter out FIFO
+# otherwise -- free --
+
+# ra8-11
+# temp in some places - check usage
+# Y: (with rb8-11) horiz out FIFO
+
+# ra12-15
+# -- free --
+
+# uniform: width:height
+.set ra_width_height, ra16
+.set ra_width, ra16.16b
+.set ra_height, ra16.16a
+
+# y:y2 same layout as y_y2_next so we can update both together
+.set ra_y_y2, ra17
+.set ra_y2, ra17.16a
+.set ra_y, ra17.16b
+
+# uniform: L1 weight (U on left, V on right)
+# Only used in Y B
+.set ra_wt_off_mul_l1, ra18
+.set ra_wt_off_l1, ra18.16b
+.set ra_wt_mul_l1, ra18.16a
+
+# y_next:y2_next same layout as y_y2 so we can update both together
+.set ra_y_y2_next, ra19
+.set ra_y_next, ra19.16b
+.set ra_y2_next, ra19.16a
+
+# Setup: consts - subdivide a single register
+.set ra_kff100100, ra20
+.set ra_k256, ra20.16a
+.set ra_k0, ra20.8a
+.set ra_k1, ra20.8b
+.set ra_k16, ra20.8c
+.set ra_k255, ra20.8d
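+# (0xff100100 unpacks as .16a = 0x0100 = 256, .8a = 0x00, .8b = 0x01,
+#  .8c = 0x10 = 16 and .8d = 0xff = 255 - hence the constant names above)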
+
+# Loop: xshifts
+.set ra_xshift, ra21.16a
+.set ra_xshift_next, ra21.16b
+
+# Loop var: L0 weight (U on left, V on right)
+# _off_ is not used in loop as we want to modify it before use
+.set ra_wt_off_mul_l0, ra22
+.set ra_wt_mul_l0, ra22.16a
+.set ra_wt_off_l0, ra22.16b
+
+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
+# * Could merge with rb_pmask. For 10 bit, logically pmask needs 0xff in the
+#   2nd byte, but as the source should never be > 3 there, 0x3ff should do
+.set ra_blk_height_pmax, ra23
+.set ra_pmax, ra23.16a
+.set ra_blk_height, ra23.8c
+# -- free -- ra23.8d
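+# (e.g. at 10 bit this is loaded as 0x3ff | (8 << 16) = 0x000803ff, giving
+#  ra_pmax = 0x3ff and ra_blk_height = 8; at 8 bit it is 0xff | (16 << 16))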
+
+# Loop: src frame base (L0)
+.set ra_base, ra24
+
+# Loop: src frame base (L1)
+.set ra_base2, ra25
+
+# Loop: next src frame base (L0)
+.set ra_base_next, ra26
+
+# -- free -- ra27
+# -- free -- ra28
+# -- free -- ra29
+
+# Use an even numbered register as a link register to avoid corrupting flags
+.set ra_link, ra30
+
+# -- free -- ra31
+
+.set rb_xshift2, rb0
+.set rb_xshift2_next, rb1
+
+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
+.set rb_elem_x, rb2
+
+# El Flags
+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
+.set rb_ef, rb3
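+# (rb_ef holds [0,2,0,2,...,1,3,1,3,...] << 30, so add.setf -, rb_ef, rb_ef
+#  carries out of bit 31 exactly for the odd elems and leaves bit 31 set for
+#  elems 8..15 - giving the even/odd and lo/hi half flags described above)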
+
+# rb4-7
+# C-B: L1 H filter out FIFO
+# Y: (with ra2.8x) Y vertical filter coeffs
+
+# rb8-11
+# C: Vertical filter coeffs
+# Y: (with ra8-11) horiz out FIFO
+
+# Loop var: offset to add before shift (round + weighting offsets)
+# Exact value varies by loop
+.set rb_wt_off, rb12
+
+# Setup: denom + 6 + 9
+.set rb_wt_den_p15, rb13
+
+# -- free -- rb14
+# -- free -- rb15
+
+# Line pitch (128 for sand128)
+.set rb_pitch, rb16
+
+# Loop count - 2 (set up TMU for next xfer)
+.set rb_i_tmu, rb17
+
+# Loop count for min(height, 16)
+# Y will reset & loop again if height > 16
+.set rb_lcount, rb18
+
+# frame_base2_next
+.set rb_base2_next, rb19
+
+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
+# offset to the slice
+.set rb_xpitch, rb20
+
+# -- free -- rb21
+
+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
+.set rb_pmask, rb22
+
+# Loop: destination address
+.set rb_dest, rb23
+
+# vdw_setup_1(dst_pitch)
+.set rb_dma1_base, rb24
+
+# Setup: pic width - 1
+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
+.set rb_max_x, rb25
+
+# Loop: height<<23 + width<<16 + vdw_setup_0
+.set rb_dma0, rb26
+
+# vdw_setup_0 (depends on QPU number)
+.set rb_dma0_base, rb27
+
+# Setup: vw_setup value to reset VPM write pointer
+.set rb_vpm_init, rb28
+
+# Loop: vdw_setup_1(dst_pitch-width) = stride
+.set rb_dma1, rb29
+
+# Setup: pic_height - 1
+.set rb_max_y, rb30
+
+# -- free -- rb31
+
+
+
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
+.set i_shift23, -9
+.set i_shift30, -2
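+# (e.g. i_shift23 == -9 and (-9) & 31 == 23, so "shl r1, r1, i_shift23"
+#  shifts left by 23 - i.e. into the height field of rb_dma0)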
+
+# Much of the setup code is common between Y & C
+# Macros that express this - obviously these can't be overlapped
+# so are probably unsuitable for loop code
+
+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
+ mov r2, qpu_num
+.if v_bit_depth <= 8
+ # 8 bit version
+ asr r1, r2, 2
+ shl r1, r1, 6
+ and r0, r2, 3
+ or r0, r0, r1
+
+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+ add r_vpm, r0, r1 # VPM 8bit storage
+
+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+ shl r0, r0, 5
+
+.else
+ # 16 bit version
+ # Limited to 8 QPUs if blk height > 8
+ asr r1, r2, 1
+.if v_blk_height <= 8
+ shl r1, r1, 4
+.else
+ shl r1, r1, 5
+.endif
+ and r0, r2, 1
+ or r0, r0, r1
+
+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
+ add r_vpm, r0, r1
+
+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
+ shl r0, r0, 6
+.endif
+ add r_dma, r0, r1 # DMA out
+.endm
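+# (e.g. 8-bit, qpu_num == 5: r0 = (5 & 3) | ((5 >> 2) << 6) = 0x41, so
+#  r_vpm = vpm_setup(0, 4, h8p(0, 0)) + 0x41 and
+#  r_dma = vdw_setup_0(0, 0, dma_h8p(0,0,0)) + (0x41 << 5))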
+
+
+.macro m_setup_q0
+ srel -, 12
+.endm
+
+# Code start label
+::mc_start
+
+################################################################################
+# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+
+.macro m_setup_c, v_bit_depth
+
+# Cannot use mul24 on x as x might be -ve, so must use shift
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_pmask, 0xff
+.set v_blk_height, C_BLK_HEIGHT_8
+.else
+.set v_x_shift, 2
+.set v_pmask, 0xffff
+.set v_blk_height, C_BLK_HEIGHT_16
+.endif
+
+ mov tmurs, 1 # No swap TMUs
+
+# Load first request location
+ mov ra0, unif # next_x_y
+
+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+ shl rb_ef, r0, i_shift30
+
+ mov ra_base, unif # Store frame c base
+
+# Read image dimensions
+ sub r0, unif, 1 # pic c width
+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
+ sub rb_max_y, unif, 1 # pic c height
+
+# load constants
+ mov ra_kff100100, 0xff100100
+ mov rb_pmask, v_pmask
+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+
+# get source pitch
+ mov rb_xpitch, unif # stride2
+ mov rb_pitch, unif # stride1
+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
+
+ and r0, 1, elem_num
+ nop ; mul24 r0, r0, 5
+.if v_bit_depth <= 8
+ add rb_elem_x, r0, elem_num
+.else
+ add r0, r0, elem_num
+ add rb_elem_x, r0, r0
+.endif
+
+# Compute base address for first and second access
+# ra_base ends up with t0s base
+# ra_base2 ends up with t1s base
+
+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
+ min r0, r0, rb_max_x
+
+# Get shift
+# Shift will always calculate as 0 for 9+ bit
+# Ideally we could optimize the shift out of the code in these cases, but for now
+# it is tidier to leave it in
+.if v_bit_depth <= 8
+ shl ra_xshift_next, r0, 3
+.else
+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+.endif
+
+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
+
+.if v_bit_depth <= 8
+ and r0, r0, -4
+.endif
+ sub r1, ra_k0, rb_pitch
+ and r1, r0, r1
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1
+ add ra_base, ra_base, r0
+
+ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
+
+# Compute part of VPM to use for DMA output
+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
+
+# And again for L1, but only worrying about frame2 stuff
+
+# Load first request location
+ mov ra0, unif # next_x_y
+
+ mov ra_base2, unif # [ra0 delay] Store frame c base
+
+# Compute base address for first and second access
+# ra_base ends up with t0s base
+# ra_base2 ends up with t1s base
+
+ shl r0, ra0.16b, v_x_shift
+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+
+# Get shift (already zero if 9+ bit so ignore)
+.if v_bit_depth <= 8
+ shl rb_xshift2_next, r0, 3
+.endif
+
+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+
+.if v_bit_depth <= 8
+ and r0, r0, -4
+.endif
+ sub r1, ra_k0, rb_pitch
+ and r1, r0, r1
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov r2, ra_y2
+ add ra_base2, ra_base2, r0
+
+# Do preloads
+# r0 = ra_y, r2 = ra_y2
+ mov r3, PREREAD ; mov r0, ra_y
+
+:1
+ sub.setf r3, r3, 1
+ max r1, r0, 0
+ min r1, r1, rb_max_y
+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t0s, ra_base, r1 ; mov ra_y, r0
+
+ max r1, r2, 0
+ brr.anynz -, r:1b
+ min r1, r1, rb_max_y
+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t1s, ra_base2, r1 ; mov ra_y2, r2
+# >>> .anynz 1b
+
+ mov ra_link, unif # link
+# touch registers to keep simulator happy
+ # ra/b4..7: B0 -> B stash registers
+ mov ra4, 0 ; mov rb4, 0
+ bra -, ra_link
+ mov ra5, 0 ; mov rb5, 0
+ mov ra6, 0 ; mov rb6, 0
+ mov ra7, 0 ; mov rb7, 0
+# >>> ra_link
+.endm
+
+::mc_setup_c_q0
+ m_setup_q0
+::mc_setup_c_qn
+ m_setup_c 8
+
+################################################################################
+
+# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+
+.macro m_filter_c_p, v_tmu, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_x_mul, 2
+.set v_v_shift, 8
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 2
+.set v_x_mul, 4
+.set v_v_shift, i_shift16
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+.if v_tmu == 0
+.set vrx_xshift, rb_xshift2 # b side more convenient
+.set vrx_xshift_next, ra_xshift_next
+.set vra_y_next, ra_y_next
+.set vrx_base_next, ra_base_next
+.set vra_y, ra_y
+.set vra_base, ra_base
+.set vr_txs, t0s
+.else
+.set vrx_xshift, ra_xshift # a side more convenient
+.set vrx_xshift_next, rb_xshift2_next
+.set vra_y_next, ra_y2_next
+.set vrx_base_next, rb_base2_next
+.set vra_y, ra_y2
+.set vra_base, ra_base2
+.set vr_txs, t1s
+.endif
+
+# per-channel shifts were calculated on the *previous* invocation
+# get base addresses and per-channel shifts for *next* invocation
+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
+
+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+
+.if v_bit_depth <= 8
+ shl vrx_xshift_next, r0, 3
+ and r0, r0, -4
+.endif
+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
+ add vrx_base_next, r3, r0 ; mov r1, ra_height
+
+# set up VPM write
+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
+
+# ; unpack filter coefficients
+
+ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
+
+ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
+
+ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+
+ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
+ sub ra3, rb_wt_den_p15, ra_k1
+
+# r5 = 0 (loop counter)
+# ra9 = alias for rb_max_y
+# ra_wt_mul_l0 = weight L0
+# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
+# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
+
+# We want (r0r1)
+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
+# We fetch (after shift)
+# C0 : C3 : C1 : C4 : C2 : C5 : ...
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+.if v_tmu == 0
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+.else
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+.endif
+
+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+ min r3, r3, ra9 ; mov.ifnc r0, r2
+
+ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
+
+# apply horizontal filter
+# The filter coeffs for the two halves of this are the same (unlike in the
+# Y case) so it doesn't matter which ra0 we get them from
+# Also as the two halves are locked together we don't need to separate the 1st
+# r0 mul or the last r1 mul as they are valid for all QPUs
+
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+
+# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
+# Have to dup block as we need to move the brr - code is more common than it
+# looks at first glance
+.if v_bit_depth <= 8
+ brr.anyn -, r:1b
+ add r2, r2, r3 ; mov ra5, ra6
+ mov ra6, ra7 ; mul24 r1, ra7, rb10
+ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+.else
+ add r2, r2, r3 ; mov ra5, ra6
+ brr.anyn -, r:1b
+ mov ra6, ra7 ; mul24 r1, ra7, rb10
+ sub r2, r2, r0 ; mul24 r0, ra4, rb8
+ asr ra7, r2, v_bit_depth - 8
+.endif
+# >>> .anyn 1b
+
+ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
+ add r1, r1, r0 ; mul24 r0, ra7, rb11
+ sub r1, r1, r0
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, ra_wt_mul_l0
+ shl r1, r1, 8 ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+ brr.anyn -, r:1b
+ asr r1, r1, ra3
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> .anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
+# >>> 1b
+.endm
+
+# At 10 bits
+# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits)
+# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
+# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
+# (P)
+# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
+# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
+# ... should be OK
+#
+# (B)
+# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
+# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
+# So signed overflow if we sign extend here :-(
+#
+# In practice this doesn't happen (we need a maximal offset and a very unlucky
+# filter).
+#
+# This could be fixed by offsetting the filters s.t. they are unsigned until
+# weight mul and then removing the offset with the weighting offset (I think
+# this should work) or splitting the rounding & offsetting
+
+::mc_filter_c_p
+ m_filter_c_p 0, 8
+
+::mc_filter_c_p_l1
+ m_filter_c_p 1, 8
+
+################################################################################
+
+# mc_filter_c_b
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+
+.macro m_filter_c_b, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_v_shift, 8
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 2
+.set v_v_shift, i_shift16
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+.set v_x_mul, (1 << v_x_shift)
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
+
+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
+
+.if v_bit_depth <= 8
+ shl ra_xshift_next, r0, 3
+.endif
+
+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs)
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
+
+# set up VPM write
+
+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
+
+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
+
+# L1 - uniform layout could possibly be optimized
+
+ shl r0, ra3.16b, v_x_shift # r0=x*2
+ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
+ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
+ min r0, r0, rb_max_x ; mov rb9, ra3.8b
+
+.if v_bit_depth <= 8
+ shl rb_xshift2_next, r0, 3
+.endif
+
+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
+ and r1, r0, r1 ; mov rb10, ra3.8c
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
+ add rb_base2_next, r3, r0
+
+ mov ra9, rb_max_y ; mov rb11, ra3.8d
+ shl r1, ra_wt_off_l1, rb_wt_den_p15
+ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
+
+# r5 loop counter
+# ra0 H coeffs L0
+# ra1 H coeffs L1
+# ra2 V coeffs L0
+# ra3 temp
+# ra4-7 L0 H FIFO
+# rb4-7 L1 H FIFO
+# rb8-rb11 V coeffs L1
+# ra9 rb_max_y alias
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
+ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+ add ra_y, 1, ra_y ; mov r3, ra_y
+
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
+
+# L0 H-filter
+# H FIFO scrolls are spread all over this loop
+ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
+
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+.if v_bit_depth <= 8
+ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
+.else
+ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
+ asr ra3, r2, (v_bit_depth - 8)
+.endif
+
+ shr r2, r4, rb_xshift2 ; mov ra5, ra6
+ shr r1, r2, v_v_shift ; mov r3, ra_y2
+ add ra_y2, r3, ra_k1 ; mov rb6, rb7
+
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
+
+# L1 H-filter
+
+ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+# V filters - start in branch delay slots of H
+# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+ brr.anyn -, r:1b
+ mov ra6, ra7 ; mul24 r3, ra7, rb10
+ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+# >>> .anyn 1b
+
+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+ sub r2, r1, r0 ; mul24 r0, ra4, rb8
+ sub r1, r3, r0 ; mul24 r0, ra5, rb9
+ add r1, r1, r0 ; mul24 r0, ra7, rb11
+ sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+
+ asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+
+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
+ add r1, r1, r2 ; mov r3, ra_blk_height
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
+
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> .anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_c_b
+ m_filter_c_b 8
+
+################################################################################
+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
+# conflicts
+
+.macro m_exit_drain
+.if PREREAD == 2
+# Special case 2 as loop is wasteful
+ nop ; nop ; ldtmu0
+ nop ; nop ; ldtmu1
+ nop ; nop ; ldtmu0
+ mov -, vw_wait ; nop ; ldtmu1
+.else
+ mov.setf r3, PREREAD - 1
+:1
+ brr.anynz -, r:1b
+ nop ; nop ; ldtmu0
+ nop ; nop ; ldtmu1
+ sub.setf r3, r3, 1
+ # >>>
+ mov -, vw_wait
+.endif
+.endm
+
+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
+# All qpus start at the beginning and after that (group - 1) must have finished
+# before (group) can start
+#
+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
+# lockup otherwise)
+#
+# There is some, currently ill defined, potential lockup if we have the VDM active
+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
+#
+# The code stalled when I had many waiters on a single sem so we have a
+# "ripple" of srels to restart. Unsure why, may have been a bug, but this works
+# and we currently have both the memory & sems to support it.
+.macro m_sync_q, n_qpu, n_quads
+# Do not generate code for qpu >= quads * 4 - fns should never be called
+.if n_qpu < n_quads * 4
+ mov ra_link, unif # Can only branch to an a reg (not r0)
+ mov -, vw_wait # [ra_link delay]
+
+.set n_sem_sync, n_qpu - (n_qpu % 4)
+.set n_sem_in, n_qpu
+.set n_sem_out, n_qpu + 1
+
+.if n_qpu % 4 == 0
+
+.set n_sem_quad_in, 12 + n_qpu / 4
+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
+
+ sacq -, n_sem_sync
+ sacq -, n_sem_sync
+ sacq -, n_sem_sync
+ bra -, ra_link
+ sacq -, n_sem_quad_in
+ srel -, n_sem_out
+ srel -, n_sem_quad_out
+
+.else
+ bra -, ra_link
+ srel -, n_sem_sync
+ sacq -, n_sem_in
+.if n_sem_out % 4 != 0
+ srel -, n_sem_out
+.else
+ nop
+.endif
+.endif
+.endif
+.endm
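+# (e.g. with n_quads == 3: QPU 1 does srel 0, sacq 1, srel 2, while QPU 0
+#  does sacq 0 three times, sacq 12, then srel 1 and srel 13)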
+
+.set v_quads8, N_QPU_8 / 4
+
+::mc_sync_q0
+ m_sync_q 0, v_quads8
+::mc_sync_q1
+ m_sync_q 1, v_quads8
+::mc_sync_q2
+ m_sync_q 2, v_quads8
+::mc_sync_q3
+ m_sync_q 3, v_quads8
+::mc_sync_q4
+ m_sync_q 4, v_quads8
+::mc_sync_q5
+ m_sync_q 5, v_quads8
+::mc_sync_q6
+ m_sync_q 6, v_quads8
+::mc_sync_q7
+ m_sync_q 7, v_quads8
+::mc_sync_q8
+ m_sync_q 8, v_quads8
+::mc_sync_q9
+ m_sync_q 9, v_quads8
+::mc_sync_q10
+ m_sync_q 10, v_quads8
+::mc_sync_q11
+ m_sync_q 11, v_quads8
+
+# mc_exit()
+# Chroma & Luma the same now
+
+.macro m_exit_qn
+ m_exit_drain
+ nop ; nop ; thrend
+ nop
+ nop
+# >>> thrend <<<
+.endm
+
+::mc_exit_c_qn
+::mc_exit_y_qn
+ m_exit_qn
+
+
+
+# mc_interrupt_exit12()
+
+.macro m_exit_q0
+ m_exit_drain
+ sacq -, 12
+ nop ; nop ; thrend
+ mov interrupt, 1
+ nop
+# >>> thrend <<<
+.endm
+
+::mc_exit_c_q0
+::mc_exit_y_q0
+ m_exit_q0
+
+# LUMA CODE
+
+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8
+
+
+################################################################################
+# mc_setup
+#
+# typedef struct qpu_mc_pred_y_s_s {
+# qpu_mc_src_t next_src1;
+# qpu_mc_src_t next_src2;
+# uint16_t pic_h;
+# uint16_t pic_w;
+# uint32_t stride2;
+# uint32_t stride1;
+# uint32_t wdenom;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_s_t;
+
+.macro m_setup_y, v_bit_depth
+
+# Cannot use mul24 on x as x might be -ve, so must use shift
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_pmask, 0xff
+.set v_blk_height, Y_BLK_HEIGHT_8
+.else
+.set v_x_shift, 1
+.set v_pmask, 0xffff
+.set v_blk_height, Y_BLK_HEIGHT_16
+.endif
+
+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
+ mov ra9, unif # ref_y_base
+ mov ra1, unif # x2_y2
+ mov ra11, unif # ref_y2_base
+
+# load constants
+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+ shl rb_ef, r0, i_shift30
+
+
+ mov ra_kff100100, 0xff100100
+ mov rb_pmask, v_pmask
+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+
+# Compute part of VPM to use
+
+# Read image dimensions
+ mov ra3, unif # width_height
+ mov rb_xpitch, unif # stride2
+.if v_x_shift == 0
+ sub rb_max_x, ra3.16b, 1
+.else
+ sub r0, ra3.16b, 1
+ shl rb_max_x, r0, v_x_shift
+.endif
+ sub rb_max_y, ra3.16a, 1
+ mov rb_pitch, unif # stride1
+
+# get destination pitch
+ mov r1, vdw_setup_1(0)
+ or rb_dma1_base, r1, rb_pitch
+
+# Compute base address for first and second access
+ mov r3, elem_num
+ add r0, ra0.16b, r3 # Load x + elem_num
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+ shl ra_xshift_next, r0, 3 # Compute shifts
+
+# X is byte offset - we can only load words - mask
+
+ and r0, r0, -4 ; v8subs r2, r2, r2
+ sub r2, r2, rb_pitch
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 # Add stripe offsets
+ add ra_base, ra9, r0
+
+ # r3 still contains elem_num
+ add r0, ra1.16b, r3 # Load x
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+ shl rb_xshift2_next, r0, 3 # Compute shifts
+
+ # r2 still contains mask
+ and r0, r0, -4
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 # Add stripe offsets
+ add ra_base2, ra11, r0
+
+# Do preloads
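+# (issue PREREAD rows of TMU requests for each of the two sources so that the
+#  texture FIFOs are already primed before the first filter loop iteration)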
+ nop ; mov r0, ra0.16a # ; r0 = y
+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
+
+:1
+ sub.setf r3, r3, 1
+ max r1, r0, 0
+ min r1, r1, rb_max_y
+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t0s, ra_base, r1 ; mov ra_y, r0
+
+ max r1, r2, 0
+ brr.anynz -, r:1b
+ min r1, r1, rb_max_y
+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t1s, ra_base2, r1 ; mov ra_y2, r2
+# >>> .anynz 1b
+
+ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
+
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
+
+ mov ra_link, unif # Next fn
+
+# touch vertical context to keep simulator happy
+ mov ra8, 0 ; mov rb8, 0
+ bra -, ra_link
+ mov ra9, 0 ; mov rb9, 0
+ mov ra10, 0 ; mov rb10, 0
+ mov ra11, 0 ; mov rb11, 0
+# >>> ra_link
+.endm
+
+::mc_setup_y_q0
+ m_setup_q0
+::mc_setup_y_qn
+ m_setup_y 8
+
+################################################################################
+#
+# Start of per-block setup code
+# P and B blocks share the same setup code to save on Icache space
+
+# luma_setup_delay3 done in delay slots of branch that got us here
+
+# get base addresses and per-channel shifts for *next* invocation
+# per-channel shifts were calculated on the *previous* invocation
+
+# 1st 3 instructions of per_block-setup in branch delay
+#
+# typedef struct qpu_mc_pred_y_p_s {
+# qpu_mc_src_t next_src1;
+# qpu_mc_src_t next_src2;
+# uint16_t h;
+# uint16_t w;
+# uint32_t mymx21;
+# uint32_t wo1;
+# uint32_t wo2;
+# uint32_t dst_addr;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_p_t;
+#
+
+.macro m_luma_setup, v_bit_depth
+# Hack - QASM may well have label pasting but I have no idea how...
+.if v_bit_depth == 8
+ brr ra_link, r:per_block_setup_8
+.elif v_bit_depth == 10
+ brr ra_link, r:per_block_setup_10
+.endif
+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
+ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+.endm
+
+.macro m_per_block_setup, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_x_mul, 1
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 1
+.set v_x_mul, 2
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+ min r0, r0, rb_max_x
+
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ and r0, r0, -4
+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
+ add ra_base_next, ra_base_next, r0 # [ra1 delay]
+
+ add r0, ra1.16b, r3 # Load x2
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
+ shl rb_xshift2_next, r0, 3 # Compute shifts
+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
+ add rb_base2_next, rb_base2_next, r0
+
+# get width,height of block (unif load above), r1 = width * pel_size
+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
+ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+ add rb_lcount, r0, 7
+ shl r0, r0, v_dma_h_shift
+ add r0, r0, r1 # Combine width and height of destination area
+ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
+ shl ra8, r0, 3 ; mov r3, ra_k255
+
+# Pack the 1st 4 filter coefs for H & V tightly
+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
+
+ mov r1,0x00010100 # -ve [ra8 delay]
+ ror ra2.8a, r1, ra8.8d
+ ror ra0.8a, r1, ra8.8c
+
+ mov r1, 0x01040400
+ ror ra2.8b, r1, ra8.8d
+ ror ra0.8b, r1, ra8.8c
+
+ mov r1,0x050b0a00 # -ve
+ ror ra2.8c, r1, ra8.8d
+ ror ra0.8c, r1, ra8.8c
+
+ mov r1,0x11283a40
+ ror ra2.8d, r1, ra8.8d
+ ror ra0.8d, r1, ra8.8c
+
+# In the 2nd vertical half we use b registers due to using a-side fifo regs
+
+ mov r1,0x3a281100
+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+
+ mov r1,0x0a0b0500 # -ve
+ ror r0, r1, ra8.8d
+ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+
+ mov r1,0x04040100
+ ror r0, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+
+ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address
+
+ mov r1,0x01010000 # -ve
+ ror r0, r1, ra8.8d
+
+ bra -, ra_link
+ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+
+ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc
+ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
+ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val
+# >>> branch ra_link
+
+# r5 = 0
+# ra_wt_mul_l1 = weight L1
+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
+# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1)
+# rb_wt_den_p15 = weight denom + 6 + 9
+# rb_wt_mul_l0 = weight L0
+.endm
+
+:per_block_setup_8
+ m_per_block_setup 8
+
+
+
+################################################################################
+# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, y2_x2 should be y_x+8
+# At this point we have already issued two pairs of texture requests for the current block
+
+.macro m_filter_y_pxx, v_bit_depth
+ m_luma_setup v_bit_depth
+
+ shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+
+# r5 = 0 (loop count)
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# N.B. Whilst y == y2 as far as this loop is concerned, we will start
+# the grab for the next block before we finish with this block, and that
+# might be a B block where y != y2, so we must do full processing on both y and y2
+
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y ; mov ra7, ra8
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+
+# apply horizontal filter
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+
+ sub.setf -, r5, 8 ; mov ra9, ra10
+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+ brr.anyn -, r:1b
+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+ mov ra10, ra11 ; mov rb10, rb11
+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+ # >>> .anyn 1b
+
+ # apply vertical filter and write to VPM
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0
+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+# The top 8 bits have rubbish in them as mul24 is unsigned
+# The low 6 bits need discarding before weighting
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, ra_wt_mul_l0
+ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
+
+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+
+# >>> branch.anyn yloop
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_pxx
+ m_filter_y_pxx 8
+
+
+################################################################################
+
+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+# Or possibly by taking advantage of symmetry?
+
+.macro m_filter_y_bxx, v_bit_depth
+ m_luma_setup v_bit_depth
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y ; mov ra7, ra8
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+
+# apply horizontal filter
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+
+ sub.setf -, r5, 8 ; mov ra9, ra10
+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+ brr.anyn -, r:1b
+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+ mov ra10, ra11 ; mov rb10, rb11
+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+ # >>> .anyn 1b
+
+ # apply vertical filter and write to VPM
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb_wt_off
+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+# Top 8 bits are bad - low 6 bits should be discarded
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+
+ asr r1, r1, 14
+ nop ; mul24 r0, r1, ra_wt_mul_l0
+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+
+ add r1, r1, r0 ; mov r3, ra_blk_height
+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_bxx
+ m_filter_y_bxx 8
+
+################################################################################
+#
+# typedef struct qpu_mc_pred_y_p00_s {
+# qpu_mc_src_t next_src1;
+# uint16_t h;
+# uint16_t w;
+# uint32_t wo1;
+# uint32_t dst_addr;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_p00_t;
+
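+# P prediction with both filter fractions zero: no 8-tap filtering is needed, so each
+# source pel is simply clamped, weighted and offset before being written to the VPM.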
+.macro m_filter_y_p00, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_x_mul, 1
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 1
+.set v_x_mul, 2
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+ mov ra0, unif ; mov r3, elem_num # y_x
+ mov ra_xshift, ra_xshift_next # [ra0 delay]
+ add r0, ra0.16b, r3
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ and r0, r0, -4 ; v8subs r2, r2, r2
+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
+
+# get width,height of block (unif load above)
+# Compute vdw_setup1(dst_pitch-width)
+ shl r1, ra_width, v_x_shift
+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset
+ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr
+ add rb_dma0, r0, rb_dma0_base
+
+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
+ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
+ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_p00
+ m_filter_y_p00 8
+
+################################################################################
+
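+# B prediction with both filter fractions zero on both refs: each output pel is just
+# the weighted sum of the two source pels plus the weight offset, then shifted and clamped.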
+.macro m_filter_y_b00, v_bit_depth
+# luma setup does a fair bit more than we need, calculating filter coeffs
+# that we will never use, but it saves I-cache to reuse it (and it is simple!)
+ m_luma_setup v_bit_depth
+
+# Fix up vals that were expecting a filter (somewhat icky)
+ mov r0, 7
+ sub rb_i_tmu, rb_i_tmu, r0
+ sub rb_lcount, rb_lcount, r0
+ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+ shl rb_wt_off, rb_wt_off, r0
+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+ add r1, r0, r1
+ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_b00
+ m_filter_y_b00 8
+
+################################################################################
+################################################################################
+# 10 BIT
+
+::mc_setup_c10_q0
+ m_setup_q0
+::mc_setup_c10_qn
+ m_setup_c 10
+
+::mc_filter_c10_p
+ m_filter_c_p 0, 10
+
+::mc_filter_c10_p_l1
+ m_filter_c_p 1, 10
+
+
+::mc_filter_c10_b
+ m_filter_c_b 10
+
+# Even if these fns are the same as for other bit depths we want our own copy
+# to keep the code we are using in a single lump and so avoid (direct-map) cache
+# thrashing
+.set v_quads10, N_QPU_16 / 4
+
+::mc_sync10_q0
+ m_sync_q 0, v_quads10
+::mc_sync10_q1
+ m_sync_q 1, v_quads10
+::mc_sync10_q2
+ m_sync_q 2, v_quads10
+::mc_sync10_q3
+ m_sync_q 3, v_quads10
+::mc_sync10_q4
+ m_sync_q 4, v_quads10
+::mc_sync10_q5
+ m_sync_q 5, v_quads10
+::mc_sync10_q6
+ m_sync_q 6, v_quads10
+::mc_sync10_q7
+ m_sync_q 7, v_quads10
+::mc_sync10_q8
+ m_sync_q 8, v_quads10
+::mc_sync10_q9
+ m_sync_q 9, v_quads10
+::mc_sync10_q10
+ m_sync_q 10, v_quads10
+::mc_sync10_q11
+ m_sync_q 11, v_quads10
+
+::mc_exit_y10_q0
+::mc_exit_c10_q0
+ m_exit_q0
+
+::mc_exit_y10_qn
+::mc_exit_c10_qn
+ m_exit_qn
+
+::mc_setup_y10_q0
+ m_setup_q0
+::mc_setup_y10_qn
+ m_setup_y 10
+
+:per_block_setup_10
+ m_per_block_setup 10
+
+::mc_filter_y10_pxx
+ m_filter_y_pxx 10
+
+::mc_filter_y10_p00
+ m_filter_y_p00 10
+
+::mc_filter_y10_bxx
+ m_filter_y_bxx 10
+
+::mc_filter_y10_b00
+ m_filter_y_b00 10
+
+
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h
new file mode 100644
index 0000000000..9f8983da52
--- /dev/null
+++ b/libavcodec/rpi_shader_cmd.h
@@ -0,0 +1,128 @@
+#ifndef RPI_SHADER_CMD_H
+#define RPI_SHADER_CMD_H
+
+#pragma pack(push, 4)
+
+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
+// If mixed then we are just confused and get a lot of warnings....
+typedef const uint8_t * qpu_mc_src_addr_t;
+typedef uint8_t * qpu_mc_dst_addr_t;
+#else
+typedef uint32_t qpu_mc_src_addr_t;
+typedef uint32_t qpu_mc_dst_addr_t;
+#endif
+
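+// Each struct below mirrors the uniform stream consumed by one of the QPU motion
+// compensation kernels: the next_src* fields describe the source(s) to prefetch for
+// the *next* block and next_fn holds the code address of the next kernel to branch
+// to (the ARM-side emulation in rpi_shader_template_fn.h dispatches on the same value).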
+typedef struct qpu_mc_src_s
+{
+ int16_t y;
+ int16_t x;
+ qpu_mc_src_addr_t base;
+} qpu_mc_src_t;
+
+
+typedef struct qpu_mc_pred_c_p_s {
+ qpu_mc_src_t next_src;
+ uint16_t h;
+ uint16_t w;
+ uint32_t coeffs_x;
+ uint32_t coeffs_y;
+ uint32_t wo_u;
+ uint32_t wo_v;
+ qpu_mc_dst_addr_t dst_addr_c;
+ uint32_t next_fn;
+} qpu_mc_pred_c_p_t;
+
+typedef struct qpu_mc_pred_c_b_s {
+ qpu_mc_src_t next_src1;
+ uint16_t h;
+ uint16_t w;
+ uint32_t coeffs_x1;
+ uint32_t coeffs_y1;
+ uint32_t weight_u1;
+ uint32_t weight_v1;
+ qpu_mc_src_t next_src2;
+ uint32_t coeffs_x2;
+ uint32_t coeffs_y2;
+ uint32_t wo_u2;
+ uint32_t wo_v2;
+ qpu_mc_dst_addr_t dst_addr_c;
+ uint32_t next_fn;
+} qpu_mc_pred_c_b_t;
+
+typedef struct qpu_mc_pred_c_s_s {
+ qpu_mc_src_t next_src1;
+ uint32_t pic_cw; // C Width (== Y width / 2)
+ uint32_t pic_ch; // C Height (== Y Height / 2)
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t wdenom;
+ qpu_mc_src_t next_src2;
+ uint32_t next_fn;
+} qpu_mc_pred_c_s_t;
+
+typedef struct qpu_mc_pred_c_s {
+ union {
+ qpu_mc_pred_c_p_t p;
+ qpu_mc_pred_c_b_t b;
+ qpu_mc_pred_c_s_t s;
+ };
+} qpu_mc_pred_c_t;
+
+
+typedef struct qpu_mc_pred_y_p_s {
+ qpu_mc_src_t next_src1;
+ qpu_mc_src_t next_src2;
+ uint16_t h;
+ uint16_t w;
+ uint32_t mymx21;
+ uint32_t wo1;
+ uint32_t wo2;
+ qpu_mc_dst_addr_t dst_addr;
+ uint32_t next_fn;
+} qpu_mc_pred_y_p_t;
+
+typedef struct qpu_mc_pred_y_p00_s {
+ qpu_mc_src_t next_src1;
+ uint16_t h;
+ uint16_t w;
+ uint32_t wo1;
+ qpu_mc_dst_addr_t dst_addr;
+ uint32_t next_fn;
+} qpu_mc_pred_y_p00_t;
+
+typedef struct qpu_mc_pred_y_s_s {
+ qpu_mc_src_t next_src1;
+ qpu_mc_src_t next_src2;
+ uint16_t pic_h;
+ uint16_t pic_w;
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t wdenom;
+ uint32_t next_fn;
+} qpu_mc_pred_y_s_t;
+
+// Only a useful structure in that it allows us to return something other than a void *
+typedef struct qpu_mc_pred_y_s {
+ union {
+ qpu_mc_pred_y_p_t p;
+ qpu_mc_pred_y_p00_t p00;
+ qpu_mc_pred_y_s_t s;
+ };
+} qpu_mc_pred_y_t;
+
+typedef union qpu_mc_pred_cmd_u {
+ qpu_mc_pred_y_t y;
+ qpu_mc_pred_c_t c;
+ uint32_t data[1];
+} qpu_mc_pred_cmd_t;
+
+#define QPU_MC_PRED_N_Y8 12
+#define QPU_MC_PRED_N_C8 12
+
+#define QPU_MC_PRED_N_Y10 12
+#define QPU_MC_PRED_N_C10 12
+
+#pragma pack(pop)
+
+#endif
+
diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c
new file mode 100644
index 0000000000..1925ab7a79
--- /dev/null
+++ b/libavcodec/rpi_shader_template.c
@@ -0,0 +1,65 @@
+#ifdef RPI
+
+#include "hevc.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "rpi_shader_cmd.h"
+#include "rpi_shader_template.h"
+
+typedef struct shader_track_s
+{
+ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
+ const struct qpu_mc_src_s *last_l0;
+ const struct qpu_mc_src_s *last_l1;
+ uint32_t width; // pic_width * PW
+ uint32_t height;
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t wdenom;
+} shader_track_t;
+
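+// Map a block width in pels onto the index used by the hevcdsp put_hevc_* fn tables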
+static int wtoidx(const unsigned int w)
+{
+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+ return pel_weight[w];
+}
+
+static int fctom(uint32_t x)
+{
+ int rv;
+ // As it happens we can take the 2nd filter term & divide it by 8
+ // (dropping fractions) to get the fractional move
+ rv = 8 - ((x >> 11) & 0xf);
+ av_assert2(rv >= 0 && rv <= 7);
+ return rv;
+}
+
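+// Sign-extending bit-field extract: shift the field up to the top then arithmetic-shift back down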
+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
+{
+ return (x << shl) >> shr;
+}
+
+static inline int woff_p(HEVCContext *const s, int32_t x)
+{
+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
+}
+
+static inline int woff_b(HEVCContext *const s, int32_t x)
+{
+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
+}
+
+static inline int wweight(int32_t x)
+{
+ return ext(x, 16, 16);
+}
+
+
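+// "Template" instantiation by textual inclusion: rpi_shader_template_fn.h is pulled in
+// twice, once with PW (pel width in bytes) = 1 and once with PW = 2, so that, e.g.,
+// FUNC(rpi_shader_c) expands to the rpi_shader_c8 and rpi_shader_c16 variants declared
+// in rpi_shader_template.h.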
+#define PW 1
+#include "rpi_shader_template_fn.h"
+
+#undef PW
+#define PW 2
+#include "rpi_shader_template_fn.h"
+
+#endif
+
diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h
new file mode 100644
index 0000000000..ecf5b8185a
--- /dev/null
+++ b/libavcodec/rpi_shader_template.h
@@ -0,0 +1,24 @@
+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
+
+#ifdef RPI
+struct HEVCContext;
+struct HEVCRpiInterPredEnv;
+
+void rpi_shader_c8(struct HEVCContext *const s,
+ const struct HEVCRpiInterPredEnv *const ipe_y,
+ const struct HEVCRpiInterPredEnv *const ipe_c);
+
+void rpi_shader_c16(struct HEVCContext *const s,
+ const struct HEVCRpiInterPredEnv *const ipe_y,
+ const struct HEVCRpiInterPredEnv *const ipe_c);
+
+void rpi_sand_dump8(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
+
+void rpi_sand_dump16(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
+
+#endif
+#endif
+
diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h
new file mode 100644
index 0000000000..b5ac2ceed6
--- /dev/null
+++ b/libavcodec/rpi_shader_template_fn.h
@@ -0,0 +1,477 @@
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
+
+#define PATCH_STRIDE (16 * PW)
+
+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
+{
+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
+ const pixel s = *(const pixel *)src;
+ pixel * d = (pixel *)dst;
+ for (unsigned int j = 0; j < w; j += PW) {
+ *d++ = s;
+ }
+ }
+}
+
+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
+{
+ for (unsigned int i = 0; i != h; ++i, dst += stride) {
+ memcpy(dst, src, w);
+ }
+}
+
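+// Fetch a w x h patch for one reference block from a sand-format frame into a planar
+// scratch buffer. Out-of-picture coordinates are clamped and the missing columns/rows
+// are then filled by replicating the edge pels, matching the clamped per-sample reads
+// the QPU code does.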
+static void FUNC(get_patch_y)(const shader_track_t * const st,
+ uint8_t * dst, const unsigned int dst_stride,
+ const qpu_mc_src_t *src,
+ unsigned int _w, unsigned int _h)
+{
+ int x = src->x * PW;
+ int y = src->y;
+ int w = _w * PW;
+ int h = _h;
+ int dl = 0;
+ int dr = 0;
+ int dt = 0;
+ int db = 0;
+
+ if (x < 0) {
+ if (-x >= w)
+ x = PW - w;
+ dl = -x;
+ w += x;
+ x = 0;
+ }
+ if (x + w > st->width) {
+ if (x >= st->width)
+ x = st->width - PW;
+ dr = (x + w) - st->width;
+ w = st->width - x;
+ }
+
+ // Y
+ if (y < 0) {
+ if (-y >= h)
+ y = 1 - h;
+ dt = -y;
+ h += y;
+ y = 0;
+ }
+ if (y + h > st->height) {
+ if (y >= st->height)
+ y = st->height - 1;
+ db = (y + h) - st->height;
+ h = st->height - y;
+ }
+
+ dst += dl + dt * dst_stride;
+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
+
+ // Edge dup
+ if (dl != 0)
+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
+ if (dr != 0)
+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
+ w += dl + dr;
+ dst -= dl;
+
+ if (dt != 0)
+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
+ if (db != 0)
+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
+}
+
+
+
+static void FUNC(get_patch_c)(const shader_track_t * const st,
+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
+ const qpu_mc_src_t *src,
+ unsigned int _w, unsigned int _h)
+{
+ int x = src->x * PW;
+ int y = src->y;
+ int w = _w * PW;
+ int h = _h;
+ int dl = 0;
+ int dr = 0;
+ int dt = 0;
+ int db = 0;
+ const int width = st->width;
+ const int height = st->height;
+
+ if (x < 0) {
+ if (-x >= w)
+ x = PW - w;
+ dl = -x;
+ w += x;
+ x = 0;
+ }
+ if (x + w > width) {
+ if (x >= width)
+ x = width - PW;
+ dr = (x + w) - width;
+ w = width - x;
+ }
+
+ // Y
+ if (y < 0) {
+ if (-y >= h)
+ y = 1 - h;
+ dt = -y;
+ h += y;
+ y = 0;
+ }
+ if (y + h > height) {
+ if (y >= height)
+ y = height - 1;
+ db = (y + h) - height;
+ h = height - y;
+ }
+
+ dst_u += dl + dt * dst_stride;
+ dst_v += dl + dt * dst_stride;
+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
+
+ // Edge dup
+ if (dl != 0)
+ {
+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
+ }
+ if (dr != 0)
+ {
+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
+ }
+ w += dl + dr;
+ dst_u -= dl;
+ dst_v -= dl;
+
+ if (dt != 0)
+ {
+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
+ }
+ if (db != 0)
+ {
+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
+ }
+}
+
+// x, y, w, h in pixels
+// stride1, stride2 in bytes
+void FUNC(rpi_sand_dump)(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
+{
+ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
+
+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
+
+ if (is_c) {
+ x *= 2;
+ w *= 2;
+ }
+
+ for (int i = y; i != y + h; ++i) {
+ for (int j = x; j != x + w; ++j) {
+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
+ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
+#if PW == 1
+ if (j < 0 || i < 0)
+ printf("..%c", sep);
+ else
+ printf("%02x%c", *(const pixel*)p, sep);
+#else
+ if (j < 0 || i < 0)
+ printf("...%c", sep);
+ else
+ printf("%03x%c", *(const pixel*)p, sep);
+#endif
+ }
+ printf("\n");
+ }
+}
+
+
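+// ARM-side equivalent of the QPU shaders (presumably only used when QPU emulation is
+// enabled): walk each QPU's command queue, dispatch on the link word - the code address
+// the real shader would branch to next - and perform the matching prediction with the
+// normal hevcdsp functions, stepping to the next QPU at each sync point and finishing
+// once the exit code is reached.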
+void FUNC(rpi_shader_c)(HEVCContext *const s,
+ const HEVCRpiInterPredEnv *const ipe_y,
+ const HEVCRpiInterPredEnv *const ipe_c)
+{
+ for (int c_idx = 0; c_idx < 2; ++c_idx)
+ {
+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
+ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
+ unsigned int exit_n = 0;
+
+ if (ipe == NULL || !ipe->used) {
+ continue;
+ }
+
+ do {
+ for (unsigned int i = 0; i != ipe->n; ++i) {
+ const HEVCRpiInterPredQ * const q = ipe->q + i;
+ shader_track_t * const st = tracka + i;
+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
+
+ for (;;) {
+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
+
+ if (link == q->code_setup) {
+ if (c_idx == 0) {
+ // Luma
+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
+
+ st->height = c->pic_h;
+ st->width = c->pic_w * PW;
+ st->stride1 = c->stride1;
+ st->stride2 = c->stride2;
+ st->wdenom = c->wdenom;
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else {
+ // Chroma
+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
+
+ st->height = c->pic_ch;
+ st->width = c->pic_cw * PW;
+ st->stride1 = c->stride1;
+ st->stride2 = c->stride2;
+ st->wdenom = c->wdenom;
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ }
+ else if (link == s->qpu.y_pxx) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+ const int w1 = FFMIN(c->w, 8);
+ const int w2 = c->w - w1;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+ if (w2 > 0) {
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h + 7);
+ }
+
+ // wo[offset] = offset*2+1
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
+ if (w2 > 0) {
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
+ }
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_bxx) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h + 7);
+
+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
+
+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_p00) {
+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+
+ // wo[offset] = offset*2+1
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
+
+ st->last_l0 = &c->next_src1;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_b00) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ av_assert0(c->w <= 16 && c->h <= 64);
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h);
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h);
+
+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
+ patch_y3, patch_y1, PATCH_STRIDE,
+ c->h, 0, 0, c->w);
+
+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
+ 0, woff_b(s, c->wo2), 0, 0, c->w);
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_pxx) {
+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
+ const int mx = fctom(c->coeffs_x);
+ const int my = fctom(c->coeffs_y);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l0 = &c->next_src;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_pxx_l1) {
+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
+ const int mx = fctom(c->coeffs_x);
+ const int my = fctom(c->coeffs_y);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l1 = &c->next_src;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_bxx) {
+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
+ const int mx1 = fctom(c->coeffs_x1);
+ const int my1 = fctom(c->coeffs_y1);
+ const int mx2 = fctom(c->coeffs_x2);
+ const int my2 = fctom(c->coeffs_y2);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72];
+ uint8_t patch_v1[PATCH_STRIDE * 72];
+ uint8_t patch_u2[PATCH_STRIDE * 72];
+ uint8_t patch_v2[PATCH_STRIDE * 72];
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, mx1, my1, c->w);
+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, mx1, my1, c->w);
+
+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
+ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
+ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == q->code_sync) {
+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
+ break;
+ }
+ else if (link == q->code_exit) {
+ // We expect exit to occur without other sync
+ av_assert0(i == exit_n);
+ ++exit_n;
+ break;
+ }
+ else {
+ av_assert0(0);
+ }
+ }
+
+ st->qpu_mc_curr = cmd;
+ }
+ } while (exit_n == 0);
+ }
+}
+
+#undef FUNC
+#undef pixel
+
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
index 0000000000..b502de0a2c
--- /dev/null
+++ b/libavcodec/rpi_zc.c
@@ -0,0 +1,745 @@
+#include "config.h"
+#ifdef RPI
+#include "libavcodec/avcodec.h"
+#include "rpi_qpu.h"
+#include "rpi_mailbox.h"
+#include "rpi_zc.h"
+#include "libavutil/avassert.h"
+#include "libavutil/rpi_sand_fns.h"
+#include <pthread.h>
+
+#include "libavutil/buffer_internal.h"
+#include <interface/vctypes/vc_image_types.h>
+
+#define TRACE_ALLOC 0
+
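+// Simple recycling pool of GPU buffers of (roughly) a single size: freed entries are
+// kept on a list and reused while requests stay close to the current size; a request
+// for a markedly different size flushes the pool (see zc_pool_alloc).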
+struct ZcPoolEnt;
+
+typedef struct ZcPool
+{
+ int numbytes;
+ unsigned int n;
+ struct ZcPoolEnt * head;
+ pthread_mutex_t lock;
+} ZcPool;
+
+typedef struct ZcPoolEnt
+{
+ // It is important that we start with gmem as other bits of code will expect to see that
+ GPU_MEM_PTR_T gmem;
+ unsigned int n;
+ struct ZcPoolEnt * next;
+ struct ZcPool * pool;
+} ZcPoolEnt;
+
+#define ALLOC_PAD 0
+#define ALLOC_ROUND 0x1000
+#define ALLOC_N_OFFSET 0
+#define STRIDE_ROUND 64
+#define STRIDE_OR 0
+
+#define DEBUG_ZAP0_BUFFERS 0
+
+
+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
+{
+ ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+
+ // Round up to ALLOC_ROUND (4k) and add ALLOC_PAD
+ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ goto fail0;
+ }
+
+ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
+ goto fail1;
+ }
+
+#if TRACE_ALLOC
+ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+#endif
+
+ pool->numbytes = zp->gmem.numbytes;
+ zp->next = NULL;
+ zp->pool = pool;
+ zp->n = pool->n++;
+ return zp;
+
+fail1:
+ av_free(zp);
+fail0:
+ return NULL;
+}
+
+static void zc_pool_ent_free(ZcPoolEnt * const zp)
+{
+#if TRACE_ALLOC
+ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+#endif
+
+ gpu_free(&zp->gmem);
+ av_free(zp);
+}
+
+static void zc_pool_flush(ZcPool * const pool)
+{
+ ZcPoolEnt * p = pool->head;
+ pool->head = NULL;
+ pool->numbytes = -1;
+
+ while (p != NULL)
+ {
+ ZcPoolEnt * const zp = p;
+ p = p->next;
+ zc_pool_ent_free(zp);
+ }
+}
+
+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes)
+{
+ ZcPoolEnt * zp;
+ int numbytes;
+
+ pthread_mutex_lock(&pool->lock);
+
+ numbytes = pool->numbytes;
+
+ // If size isn't close then dump the pool
+ // Close in this context means within 128k
+ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
+ {
+ zc_pool_flush(pool);
+ numbytes = req_bytes;
+ }
+
+ if (pool->head != NULL)
+ {
+ zp = pool->head;
+ pool->head = zp->next;
+ }
+ else
+ {
+ zp = zc_pool_ent_alloc(pool, numbytes);
+ }
+
+ pthread_mutex_unlock(&pool->lock);
+
+ // Start with our buffer empty of preconceptions
+// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
+
+ return zp;
+}
+
+static void zc_pool_free(ZcPoolEnt * const zp)
+{
+ ZcPool * const pool = zp == NULL ? NULL : zp->pool;
+ if (zp != NULL)
+ {
+ pthread_mutex_lock(&pool->lock);
+#if TRACE_ALLOC
+ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes);
+#endif
+
+ if (pool->numbytes == zp->gmem.numbytes)
+ {
+ zp->next = pool->head;
+ pool->head = zp;
+ pthread_mutex_unlock(&pool->lock);
+ }
+ else
+ {
+ pthread_mutex_unlock(&pool->lock);
+ zc_pool_ent_free(zp);
+ }
+ }
+}
+
+static void
+zc_pool_init(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ pool->head = NULL;
+ pthread_mutex_init(&pool->lock, NULL);
+}
+
+static void
+zc_pool_destroy(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ zc_pool_flush(pool);
+ pthread_mutex_destroy(&pool->lock);
+}
+
+typedef struct ZcOldCtxVals
+{
+ int thread_safe_callbacks;
+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+ void * get_buffer_context;
+} ZcOldCtxVals;
+
+typedef struct AVZcEnv
+{
+ unsigned int refcount;
+ ZcPool pool;
+ ZcOldCtxVals old;
+} ZcEnv;
+
+// Callback when buffer unrefed to zero
+static void rpi_free_display_buffer(void *opaque, uint8_t *data)
+{
+ ZcPoolEnt *const zp = opaque;
+// printf("%s: data=%p\n", __func__, data);
+ zc_pool_free(zp);
+}
+
+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
+{
+ // Kludge: check the free fn to verify this is really
+ // one of our buffers - can't think of a better way
+ return buf == NULL || buf->buffer->free != rpi_free_display_buffer ? NULL :
+ av_buffer_get_opaque(buf);
+}
+
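+// Work out the stride/height/plane layout the GPU expects for a frame of the given
+// format and size. For the sand formats the numbers come from the VC via the mailbox
+// and are cached under a lock as the query is relatively expensive.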
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const int format, const unsigned int video_width, const unsigned int video_height)
+{
+ AVRpiZcFrameGeometry geo;
+
+ switch (format)
+ {
+ case AV_PIX_FMT_YUV420P:
+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+ geo.stride_c = geo.stride_y / 2;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ geo.planes_c = 2;
+ geo.stripes = 1;
+ geo.bytes_per_pel = 1;
+ break;
+
+ case AV_PIX_FMT_YUV420P10:
+ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+ geo.stride_c = geo.stride_y / 2;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ geo.planes_c = 2;
+ geo.stripes = 1;
+ geo.bytes_per_pel = 2;
+ break;
+
+ case AV_PIX_FMT_SAND128:
+ {
+ const unsigned int stripe_w = 128;
+
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox, keep a stashed
+ // copy as we will almost certainly just want the same numbers again,
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV_UV,
+ .width = video_width,
+ .height = video_height
+ };
+
+ gpu_ref();
+ mbox_get_image_params(gpu_get_mailbox(), &new_img);
+ gpu_unref();
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.planes_c = 1;
+ geo.stripes = (video_width + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 1;
+
+ pthread_mutex_unlock(&sand_lock);
+
+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
+ break;
+ }
+
+ case AV_PIX_FMT_SAND64_16:
+ case AV_PIX_FMT_SAND64_10:
+ {
+ const unsigned int stripe_w = 128; // bytes
+
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox, keep a stashed
+ // copy as we will almost certainly just want the same numbers again,
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV_UV_16,
+ .width = video_width,
+ .height = video_height
+ };
+
+ gpu_ref();
+ mbox_get_image_params(gpu_get_mailbox(), &new_img);
+ gpu_unref();
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.planes_c = 1;
+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 2;
+
+ pthread_mutex_unlock(&sand_lock);
+ break;
+ }
+
+ default:
+ memset(&geo, 0, sizeof(geo));
+ break;
+ }
+ return geo;
+}
+
+
+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+{
+ ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ AVBufferRef * buf;
+ intptr_t idata;
+#if ALLOC_N_OFFSET != 0
+ intptr_t noff;
+#endif
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ goto fail0;
+ }
+
+ // Don't dereference zp until we know the alloc succeeded
+ idata = (intptr_t)zp->gmem.arm;
+#if ALLOC_N_OFFSET != 0
+ noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
+#endif
+
+#if ALLOC_N_OFFSET != 0
+ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
+#endif
+
+#if DEBUG_ZAP0_BUFFERS
+ memset((void*)idata, 0, size);
+#endif
+
+ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ goto fail2;
+ }
+
+ return buf;
+
+fail2:
+ zc_pool_free(zp);
+fail0:
+ return NULL;
+}
+
+static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame)
+{
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
+ const unsigned int size_y = geo.stride_y * geo.height_y;
+ const unsigned int size_c = geo.stride_c * geo.height_c;
+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
+ AVBufferRef * buf;
+ unsigned int i;
+
+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
+
+ if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ return AVERROR(ENOMEM);
+ }
+
+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+ frame->buf[i] = NULL;
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ }
+
+ frame->buf[0] = buf;
+
+ frame->linesize[0] = geo.stride_y;
+ frame->linesize[1] = geo.stride_c;
+ frame->linesize[2] = geo.stride_c;
+ // abuse: linesize[3] = "stripe stride"
+ // stripe_stride is NOT the byte stride between stripes; it is that divided by geo.stride_y
+ // (i.e. it is measured in rows). In the general case this makes the address calculation
+ // an xor and multiply rather than a divide and multiply
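+ // (So, roughly, a luma byte at (x, y) lives at
+ //  data[0] + (x & (linesize[0]-1)) + y*linesize[0] + (x & ~(linesize[0]-1))*linesize[3]
+ //  - cf. the addressing used in rpi_sand_dump8/16.)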
+ if (geo.stripes > 1)
+ frame->linesize[3] = geo.height_y + geo.height_c;
+
+ frame->data[0] = buf->data;
+ frame->data[1] = frame->data[0] + size_y;
+ if (geo.planes_c > 1)
+ frame->data[2] = frame->data[1] + size_c;
+
+ frame->extended_data = frame->data;
+ // Leave extended buf alone
+
+#if RPI_ZC_SAND_8_IN_10_BUF != 0
+ // *** If we intend to use this for real we will want a 2nd buffer pool
+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge
+#endif
+
+ return 0;
+}
+
+#define RPI_GET_BUFFER2 1
+
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+{
+#if !RPI_GET_BUFFER2
+ return avcodec_default_get_buffer2(s, frame, flags);
+#else
+ int rv;
+
+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
+ {
+// printf("Do default alloc: format=%#x\n", frame->format);
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+ else if (frame->format == AV_PIX_FMT_YUV420P ||
+ av_rpi_is_sand_frame(frame))
+ {
+ rv = rpi_get_display_buffer(s->get_buffer_context, frame);
+ }
+ else
+ {
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+
+#if 0
+ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+ frame->format, frame->width, frame->height,
+ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
+ frame->data[0], frame->data[1], frame->data[2],
+ frame->buf[0], frame->buf[1], frame->buf[2],
+ av_buffer_get_opaque(frame->buf[0]));
+#endif
+ return rv;
+#endif
+}
+
+
+static AVBufferRef * zc_copy(struct AVCodecContext * const s,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * pdest;
+
+ dest->format = src->format;
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ {
+ return NULL;
+ }
+
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ memcpy(pdest, psrc, dest->width);
+ }
+ for (i = 0, psrc = src->data[1], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+ for (i = 0, psrc = src->data[2], pdest = dest->data[2];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+
+ return dest->buf[0];
+}
+
+
+static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * psrc2, * pdest;
+
+ memset(dest, 0, sizeof(*dest));
+ dest->format = AV_PIX_FMT_SAND128;
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ {
+ return NULL;
+ }
+
+ // Y
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ const uint16_t * sp = (uint16_t*)psrc;
+ uint8_t * d = pdest;
+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0])
+ {
+ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k);
+ for (unsigned int j = 0; j != n; ++j)
+ *d++ = (uint8_t)(*sp++ >> 2);
+ d += (dest->linesize[3] - 1) * dest->linesize[0];
+ }
+ }
+
+ // C
+ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1])
+ {
+ const uint16_t * su = (uint16_t*)psrc;
+ const uint16_t * sv = (uint16_t*)psrc2;
+ uint8_t * d = pdest;
+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1])
+ {
+ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2;
+ for (unsigned int j = 0; j != n; ++j)
+ {
+ *d++ = (uint8_t)(*su++ >> 2);
+ *d++ = (uint8_t)(*sv++ >> 2);
+ }
+ d += (dest->linesize[3] - 1) * dest->linesize[1];
+ }
+ }
+
+ return dest->buf[0];
+}
+
+
+static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s,
+ const AVFrame * const src, const unsigned int src_bits)
+{
+ AVFrame dest_frame = {
+ .format = AV_PIX_FMT_SAND128,
+ .width = src->width,
+ .height = src->height
+ };
+ AVFrame * const dest = &dest_frame;
+ const unsigned int shr = src_bits - 8;
+
+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ {
+ return NULL;
+ }
+
+ // Y
+ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], av_rpi_sand_frame_stride2(dest),
+ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest),
+ src->width, src->height, shr);
+ // C
+ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest),
+ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest),
+ src->width, src->height / 2, shr);
+
+ return dest->buf[0];
+}
+
+
+
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
+{
+ assert(s != NULL);
+
+ if (frame->format != AV_PIX_FMT_YUV420P &&
+ frame->format != AV_PIX_FMT_YUV420P10 &&
+ !av_rpi_is_sand_frame(frame))
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
+ return NULL;
+ }
+
+ if (frame->buf[1] != NULL || frame->format != expected_format)
+ {
+#if RPI_ZC_SAND_8_IN_10_BUF
+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
+ {
+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
+ }
+#endif
+
+ if (maycopy)
+ {
+ if (frame->buf[1] != NULL)
+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+ else
+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
+
+ switch (frame->format)
+ {
+ case AV_PIX_FMT_YUV420P10:
+ return zc_420p10_to_sand128(s, frame);
+
+ case AV_PIX_FMT_SAND64_10:
+ return zc_sand64_16_to_sand128(s, frame, 10);
+
+ default:
+ return zc_copy(s, frame);
+ }
+ }
+ else
+ {
+ if (frame->buf[1] != NULL)
+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
+ else
+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
+ return NULL;
+ }
+ }
+
+ if (pic_gm_ptr(frame->buf[0]) == NULL)
+ {
+ if (maycopy)
+ {
+ av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
+ return zc_copy(s, frame);
+ }
+ else
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ return av_buffer_ref(frame->buf[0]);
+}
+
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? -1 : p->vc_handle;
+}
+
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : fr_ref->data - p->arm;
+}
+
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
+{
+ return fr_ref == NULL ? 0 : fr_ref->size;
+}
+
+
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : p->numbytes;
+}
+
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
+{
+ if (fr_ref != NULL)
+ {
+ av_buffer_unref(&fr_ref);
+ }
+}
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void)
+{
+ ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
+ if (zc == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
+ return NULL;
+ }
+
+ zc_pool_init(&zc->pool);
+ return zc;
+}
+
+void av_rpi_zc_env_free(AVZcEnvPtr zc)
+{
+ if (zc != NULL)
+ {
+        zc_pool_destroy(&zc->pool);
+ av_free(zc);
+ }
+}
+
+int av_rpi_zc_in_use(const struct AVCodecContext * const s)
+{
+ return s->get_buffer2 == av_rpi_zc_get_buffer2;
+}
+
+int av_rpi_zc_init(struct AVCodecContext * const s)
+{
+ if (av_rpi_zc_in_use(s))
+ {
+ ZcEnv * const zc = s->get_buffer_context;
+ ++zc->refcount;
+ }
+ else
+ {
+ ZcEnv *const zc = av_rpi_zc_env_alloc();
+ if (zc == NULL)
+ {
+ return AVERROR(ENOMEM);
+ }
+
+ zc->refcount = 1;
+ zc->old.get_buffer_context = s->get_buffer_context;
+ zc->old.get_buffer2 = s->get_buffer2;
+ zc->old.thread_safe_callbacks = s->thread_safe_callbacks;
+
+ s->get_buffer_context = zc;
+ s->get_buffer2 = av_rpi_zc_get_buffer2;
+ s->thread_safe_callbacks = 1;
+ }
+ return 0;
+}
+
+void av_rpi_zc_uninit(struct AVCodecContext * const s)
+{
+ if (av_rpi_zc_in_use(s))
+ {
+ ZcEnv * const zc = s->get_buffer_context;
+ if (--zc->refcount == 0)
+ {
+ s->get_buffer2 = zc->old.get_buffer2;
+ s->get_buffer_context = zc->old.get_buffer_context;
+ s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
+ av_rpi_zc_env_free(zc);
+ }
+ }
+}
+
+#endif // RPI
+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000000..26fb3be999
--- /dev/null
+++ b/libavcodec/rpi_zc.h
@@ -0,0 +1,105 @@
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
+// Zero-Copy frame code for RPi
+// RPi needs Y/U/V planes to be contiguous for display. By default
+// ffmpeg will allocate separate planes, so a memcpy is needed before
+// display. This code provides a method of making ffmpeg allocate a single
+// block of memory for the frame, which can then be reference counted until
+// display has finished with it.
+
+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
+// 0 disables
+// *** This option is still in development
+// Only works if SAO active
+// Allocates buffers that are twice the required size
+#define RPI_ZC_SAND_8_IN_10_BUF 0
+
+struct AVBufferRef;
+struct AVFrame;
+struct AVCodecContext;
+enum AVPixelFormat;
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef struct AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+ unsigned int stride_y; // Luma stride (bytes)
+ unsigned int height_y; // Luma height (lines)
+ unsigned int stride_c; // Chroma stride (bytes)
+    unsigned int height_c;   // Chroma height (lines)
+ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1)
+ unsigned int stripes; // Number of stripes (sand)
+ unsigned int bytes_per_pel;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const int format,
+ const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_open2
+//
+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1 as otherwise the buffer info is killed before being returned
+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
+// returned must be manually derefed with av_frame_unref. This should be done
+// after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, struct AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by av_rpi_zc_get_buffer2
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+// Get offset from the start of the memory referenced
+// by the vc_handle to valid data
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+// Length of buffer data
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Free the environment allocated by av_rpi_zc_env_alloc
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
+// Test to see if the context is using zc (checks get_buffer2)
+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Free ZC from a context
+// There is nothing magic in this fn - it just packages unsetting
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
+
+
+
+#endif
+
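For reference, a minimal decode-loop sketch using the rpi_zc.h API above. This is illustrative only and not part of the patch: the SAND128 expectation and the display_frame() consumer are hypothetical, and error handling is abbreviated.

#include "libavcodec/avcodec.h"
#include "libavcodec/rpi_zc.h"

// Hypothetical consumer: hands the GPU handle to the display pipeline.
void display_frame(int vc_handle, int offset, int length);

static int decode_one(AVCodecContext *avctx, AVPacket *pkt, AVFrame *frame)
{
    int got_frame = 0;

    // One-time setup, done before avcodec_open2():
    //   av_rpi_zc_init(avctx);        // installs av_rpi_zc_get_buffer2 + buffer pool
    //   avctx->refcounted_frames = 1; // required - see the note above

    if (avcodec_decode_video2(avctx, frame, &got_frame, pkt) < 0 || !got_frame)
        return 0;

    // Take a ZC reference; maycopy=1 allows a fallback copy for frames that
    // did not come from the ZC allocator.
    AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, AV_PIX_FMT_SAND128, 1);
    if (ref != NULL) {
        display_frame(av_rpi_zc_vc_handle(ref), // GPU memory handle
                      av_rpi_zc_offset(ref),    // start of pixel data within it
                      av_rpi_zc_length(ref));
        av_rpi_zc_unref(ref); // in real use, keep the ref until display has finished
    }

    av_frame_unref(frame);    // manual unref needed with refcounted_frames = 1
    return 1;
}

At teardown, av_rpi_zc_uninit(avctx) restores the original get_buffer2 callback and frees the pool environment once its refcount drops to zero.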
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index c4af9cbb17..c1b806e51b 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
*/
#include "config.h"
+
+#ifdef RPI
+// Move video buffers to GPU memory
+#define RPI_GPU_BUFFERS
+#endif
+
#include "libavutil/atomic.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
@@ -39,6 +45,7 @@
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/rpi_sand_fns.h"
#include "libavutil/imgutils.h"
#include "libavutil/samplefmt.h"
#include "libavutil/dict.h"
@@ -64,6 +71,10 @@
#include "libavutil/ffversion.h"
const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+#ifdef RPI_GPU_BUFFERS
+#include "rpi_qpu.h"
+#endif
+
#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
static int default_lockmgr_cb(void **arg, enum AVLockOp op)
{
@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
return ret;
}
+#ifdef RPI_GPU_BUFFERS
+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+{
+ GPU_MEM_PTR_T *p = opaque;
+ gpu_free(p);
+ av_free(p);
+}
+
+static AVBufferRef *rpi_buffer_alloc(int size)
+{
+ AVBufferRef *ret = NULL;
+ uint8_t *data = NULL;
+ GPU_MEM_PTR_T *p;
+
+ static int total=0;
+ total+=size;
+
+ p = av_malloc(sizeof *p);
+ if (!p)
+ return NULL;
+
+ if (gpu_malloc_cached(size,p)<0) // Change this line to choose cached or uncached memory. The caching here refers to the ARM data cache.
+ return NULL;
+
+ data = p->arm;
+ printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+ //memset(data, 64, size);
+
+ if (!data)
+ return NULL;
+
+ ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+ if (!ret) {
+ gpu_free(p);
+ av_freep(&p);
+ }
+
+ return ret;
+}
+#endif
+
static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
{
FramePool *pool = avctx->internal->pool;
@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
av_buffer_pool_uninit(&pool->pools[i]);
pool->linesize[i] = linesize[i];
if (size[i]) {
+#ifdef RPI_GPU_BUFFERS
+ if (avctx->codec_id == AV_CODEC_ID_HEVC)
+ pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+ CONFIG_MEMORY_POISONING ?
+ NULL :
+ rpi_buffer_alloc);
+ else
+#endif
pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
CONFIG_MEMORY_POISONING ?
NULL :
@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
{
int ret;
+#ifdef RPI
+ // This is going to end badly if we let it continue
+ av_assert0(!av_rpi_is_sand_frame(frame));
+#endif
+
if ((ret = update_frame_pool(avctx, frame)) < 0)
return ret;
diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index 21f8d9e00d..71ce7b9186 100644
--- a/libavfilter/avfilter.c
+++ b/libavfilter/avfilter.c
@@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args)
"options, but options were provided: %s.\n", args);
return AVERROR(EINVAL);
}
+ printf("=== args='%s'\n", args);
#if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR
if ( !strcmp(filter->filter->name, "format") ||
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index 6767b65ec8..f270190d57 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
#endif
{ 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
{ 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
{ 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
{ 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
{ 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 5a35953d24..d36fdc3199 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
int default_stream_index = av_find_default_stream_index(s);
if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
for (i = 0; i < s->nb_streams; i++) {
- if (av_find_program_from_stream(s, NULL, i))
+ if (0 && av_find_program_from_stream(s, NULL, i))
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 1e061763a2..cbc9bc145b 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -59,6 +59,8 @@ HEADERS = adler32.h \
rational.h \
replaygain.h \
ripemd.h \
+ rpi_sand_fns.h \
+ rpi_sand_fn_pw.h \
samplefmt.h \
sha.h \
sha512.h \
@@ -136,6 +138,7 @@ OBJS = adler32.o \
reverse.o \
rc4.o \
ripemd.o \
+ rpi_sand_fns.o \
samplefmt.o \
sha.o \
sha512.o \
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..b74b7c4e2f 100644
--- a/libavutil/arm/Makefile
+++ b/libavutil/arm/Makefile
@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \
NEON-OBJS += arm/float_dsp_init_neon.o \
arm/float_dsp_neon.o \
+ arm/rpi_sand_neon.o \
diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
new file mode 100644
index 0000000000..dbffdaefa4
--- /dev/null
+++ b/libavutil/arm/rpi_sand_neon.S
@@ -0,0 +1,40 @@
+#include "libavutil/arm/asm.S"
+
+@ void rpi_sand128b_stripe_to_8_10(
+@ uint8_t * dest, [r0]
+@ const uint8_t * src1, [r1]
+@ const uint8_t * src2, [r2]
+@ unsigned int lines); [r3]
+
+.macro stripe2_to_8, bit_depth
+ vpush {q4-q7}
+1:
+ vldm r1!, {q0-q7}
+ subs r3, #1
+ vldm r2!, {q8-q15}
+ vqrshrn.u16 d0, q0, #\bit_depth - 8
+ vqrshrn.u16 d1, q1, #\bit_depth - 8
+ vqrshrn.u16 d2, q2, #\bit_depth - 8
+ vqrshrn.u16 d3, q3, #\bit_depth - 8
+ vqrshrn.u16 d4, q4, #\bit_depth - 8
+ vqrshrn.u16 d5, q5, #\bit_depth - 8
+ vqrshrn.u16 d6, q6, #\bit_depth - 8
+ vqrshrn.u16 d7, q7, #\bit_depth - 8
+ vqrshrn.u16 d8, q8, #\bit_depth - 8
+ vqrshrn.u16 d9, q9, #\bit_depth - 8
+ vqrshrn.u16 d10, q10, #\bit_depth - 8
+ vqrshrn.u16 d11, q11, #\bit_depth - 8
+ vqrshrn.u16 d12, q12, #\bit_depth - 8
+ vqrshrn.u16 d13, q13, #\bit_depth - 8
+ vqrshrn.u16 d14, q14, #\bit_depth - 8
+ vqrshrn.u16 d15, q15, #\bit_depth - 8
+ vstm r0!, {q0-q7}
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+.endm
+
+function rpi_sand128b_stripe_to_8_10, export=1
+ stripe2_to_8 10
+endfunc
+
diff --git a/libavutil/buffer.c b/libavutil/buffer.c
index 694e116a3c..203ca7b3a8 100644
--- a/libavutil/buffer.c
+++ b/libavutil/buffer.c
@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
return ret;
}
+
+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+ BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+ return buf->opaque;
+}
diff --git a/libavutil/buffer.h b/libavutil/buffer.h
index 0c0ce12cf2..82e0bc3058 100644
--- a/libavutil/buffer.h
+++ b/libavutil/buffer.h
@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
*/
AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+// Return the opaque for the underlying frame
+void *av_buffer_pool_opaque(AVBufferRef *ref);
+
/**
* @}
*/
diff --git a/libavutil/frame.h b/libavutil/frame.h
index 2b5c3320c3..990347e484 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -120,7 +120,20 @@ enum AVFrameSideDataType {
* The GOP timecode in 25 bit timecode format. Data format is 64-bit integer.
* This is set on the first frame of a GOP that has a temporal reference of 0.
*/
- AV_FRAME_DATA_GOP_TIMECODE
+ AV_FRAME_DATA_GOP_TIMECODE,
+
+ /**
+ * The data represents the AVSphericalMapping structure defined in
+ * libavutil/spherical.h.
+ */
+ AV_FRAME_DATA_SPHERICAL,
+
+ /**
+ * Extra data required to deal with a cropped Sand frame
+ * AVFrame holds the cropped size, but we cannot simply offset the start
+ * address to get the picture as we can for planar formats
+ */
+ AV_FRAME_DATA_SAND_INFO,
};
enum AVActiveFormatDescription {
@@ -133,6 +146,13 @@ enum AVActiveFormatDescription {
AV_AFD_SP_4_3 = 15,
};
+typedef struct AVFrameDataSandInfo
+{
+ unsigned int left_offset;
+ unsigned int top_offset;
+ unsigned int pic_width;
+ unsigned int pic_height;
+} AVFrameDataSandInfo;
/**
* Structure to hold side data for an AVFrame.
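A hedged sketch of how a consumer might read the Sand crop side data declared above. Whether a given decoder attaches AV_FRAME_DATA_SAND_INFO is not shown in this hunk, and the function name is hypothetical.

#include "libavutil/frame.h"
#include "libavutil/log.h"

static void log_sand_crop(const AVFrame *frame)
{
    const AVFrameSideData *sd =
        av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);

    if (sd != NULL && sd->size >= sizeof(AVFrameDataSandInfo)) {
        const AVFrameDataSandInfo *si = (const AVFrameDataSandInfo *)sd->data;
        av_log(NULL, AV_LOG_INFO, "sand crop: +%u+%u within %ux%u\n",
               si->left_offset, si->top_offset, si->pic_width, si->pic_height);
    }
}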
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 0dffa4dbdb..17134b4f38 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR |
AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA,
},
+ [AV_PIX_FMT_SAND128] = {
+ .name = "sand128",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */
+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_SAND64_10] = {
+ .name = "sand64_10",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */
+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */
+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */
+ },
+ .flags = 0,
+ },
};
#if FF_API_PLUS1_MINUS1
FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 0ed01c4844..2155b78704 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -303,7 +303,22 @@ enum AVPixelFormat {
AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian
AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian
- AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec
+
+ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian
+ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian
+ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian
+ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian
+
+ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian
+ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian
+
+// RPI - not on ifdef so can be got at by calling progs
+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+
+ AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
};
#define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A
diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
new file mode 100644
index 0000000000..52d52a2a83
--- /dev/null
+++ b/libavutil/rpi_sand_fn_pw.h
@@ -0,0 +1,182 @@
+// * Included twice from rpi_sand_fns.c with different PW
+
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// unclipped
+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x;
+ const unsigned int w = _w;
+ const unsigned int mask = stride1 - 1;
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
+ memcpy(dst, p, w);
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const uint8_t * p = p2;
+ uint8_t * d = dst;
+ memcpy(d, p1, w1);
+ d += w1;
+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
+ memcpy(d, p, stride1);
+ }
+ memcpy(d, p, w3);
+ }
+ }
+}
+
+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
+
+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ const pixel * p = (const pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * p = (const pixel *)p1;
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+}
+
+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+}
+
+
+#undef pixel
+#undef STRCAT
+#undef FUNC
+
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
new file mode 100644
index 0000000000..ec4cfadf8a
--- /dev/null
+++ b/libavutil/rpi_sand_fns.c
@@ -0,0 +1,99 @@
+#include "config.h"
+#ifdef RPI
+#include <stdint.h>
+#include <string.h>
+#include "rpi_sand_fns.h"
+#include "avassert.h"
+
+#define PW 1
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#define PW 2
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#if HAVE_NEON
+void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines);
+#endif
+
+#if 1
+// Simple round
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ const unsigned int rnd = (1 << shr) >> 1;
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ *dst++ = (*src++ + rnd) >> shr;
+ }
+}
+#else
+// Dithered variation
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ unsigned int rnd = (1 << shr) >> 1;
+ const unsigned int mask = ((1 << shr) - 1);
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ rnd = *src++ + (rnd & mask);
+ *dst++ = rnd >> shr;
+ }
+}
+#endif
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr)
+{
+ const unsigned int n = dst_stride1 / 2;
+ unsigned int j;
+
+ // This is true for our current layouts
+ av_assert0(dst_stride1 == src_stride1);
+
+ // As we have the same stride1 for src & dest and src is wider than dest
+ // then if we loop on src we can always write contiguously to dest
+ // We make no effort to copy an exact width - round up to nearest src stripe
+ // as we will always have storage in dest for that
+
+#if HAVE_NEON
+ if (shr == 3 && src_stride1 == 128) {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
+ }
+ }
+ else
+#endif
+ {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ cpy16_to_8(d + n, s2, n, shr);
+ }
+ }
+ }
+
+ // Fix up a trailing dest half stripe
+ if (j < w) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ }
+ }
+}
+
+#endif // RPI
+
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
new file mode 100644
index 0000000000..aa880d0f63
--- /dev/null
+++ b/libavutil/rpi_sand_fns.h
@@ -0,0 +1,129 @@
+#ifndef AVUTIL_RPI_SAND_FNS
+#define AVUTIL_RPI_SAND_FNS
+#ifdef RPI
+
+#include "libavutil/frame.h"
+
+// For all these fns _x & _w are measured as coord * PW
+// For the C fns coords are in chroma pels (so luma / 2)
+// Strides are in bytes
+
+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr);
+
+
+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+{
+    // * We could replace this with a fixed 128, which would allow the compiler
+    //   to optimize a whole lot better
+ return frame->linesize[0];
+}
+
+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
+{
+ return frame->linesize[3];
+}
+
+
+static inline int av_rpi_is_sand_format(const int format)
+{
+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
+}
+
+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+{
+ return av_rpi_is_sand_format(frame->format);
+}
+
+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
+{
+ return (frame->format == AV_PIX_FMT_SAND128);
+}
+
+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
+{
+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
+}
+
+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
+{
+ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
+}
+
+// If x is measured in bytes (not pixels) then this works for sand64_16 as
+// well as sand128 - but in the general case we work that out
+
+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y + stride2 * x2;
+}
+
+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y_c + stride2 * x2;
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
+}
+
+#endif
+#endif
+
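A worked example of the offset arithmetic in av_rpi_sand_frame_off_y above. The stripe geometry is hypothetical (stride2, i.e. lines per stripe, depends on the allocated frame) and is chosen only to make the numbers concrete.

// SAND128 luma (xshl = 0), stride1 = 128 bytes, stride2 = 544 lines per stripe (assumed).
// Luma byte (x = 300, y = 16):
//   x1  = 300 & (128 - 1)        = 44       offset within the stripe
//   x2  = 300 ^ 44               = 256      byte column where the stripe starts (multiple of 128)
//   off = 44 + 128*16 + 544*256  = 141356   bytes from frame->data[0]
// i.e. the pixel sits in the third 128-byte-wide stripe (whose data starts at
// 2 * stride1 * stride2 = 139264 bytes), 16 lines down and 44 bytes across.

This is consistent with the per-stripe advance of stride1 * stride2 bytes (sstride) used by the copy loops in rpi_sand_fn_pw.h.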
diff --git a/libswscale/input.c b/libswscale/input.c
index 14ab5abb3a..7a827c71e3 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -719,6 +719,13 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
}
}
+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ // NIF
+}
+
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_P010BE:
c->chrToYV12 = p010BEToUV_c;
break;
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_SAND64_10:
+ c->chrToYV12 = sand128ToUV_c; // NIF
+ break;
}
if (c->chrSrcHSubSample) {
switch (srcFormat) {
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 576d8f0d5a..fd88a5e51e 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
[AV_PIX_FMT_AYUV64LE] = { 1, 1},
[AV_PIX_FMT_P010LE] = { 1, 0 },
[AV_PIX_FMT_P010BE] = { 1, 0 },
+#ifdef RPI
+ [AV_PIX_FMT_SAND128] = { 1, 0 },
+ [AV_PIX_FMT_SAND64_10] = { 1, 0 },
+#endif
};
int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
new file mode 100644
index 0000000000..b1e99a6a89
--- /dev/null
+++ b/pi-util/BUILD.txt
@@ -0,0 +1,25 @@
+Building Pi FFmpeg
+==================
+
+Configuration:
+=============
+
+pi-util/conf_pi2.sh
+
+contains suitable options to build the code for Pi2/3. It expects to find
+git clones of
+
+https://github.com/raspberrypi/tools
+https://github.com/raspberrypi/firmware
+
+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
+lot of history you don't want.
+
+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
+rebuilt. Otherwise the prebuilt .c & .h files will be used.
+Likewise, ../local/bin/vasmvidcore_std will enable a rebuild of the VPU code.
+
+pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time
+H265 QPU acceleration is broken on Pi1 and so it is disabled.
+
+
diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
new file mode 100644
index 0000000000..f05b7753f7
--- /dev/null
+++ b/pi-util/conf_h265.2016.csv
@@ -0,0 +1,193 @@
+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
+2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
+2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5
+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt
+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt
+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt
+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt
+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt
+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt
+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5
+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5
+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5
+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5
+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5
+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5
+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5
+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5
+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5
+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5
+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5
+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5
+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5
+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5
+2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5
+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5
+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt
+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt
+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5
+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5
+1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5
+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5
+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5
+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5
+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5
+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5
+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5
+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5
+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5
+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5
+2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5
diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
new file mode 100644
index 0000000000..6082641271
--- /dev/null
+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
@@ -0,0 +1,147 @@
+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000000..fc14f2a3c2
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh
new file mode 100755
index 0000000000..ec25b81c31
--- /dev/null
+++ b/pi-util/conf_pi1.sh
@@ -0,0 +1,31 @@
+echo "Configure for Pi1"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --cpu=arm1176jzf-s\
+ --arch=arm\
+ --disable-neon\
+ --target-os=linux\
+ --disable-stripping\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
new file mode 100755
index 0000000000..f8e5e75375
--- /dev/null
+++ b/pi-util/conf_pi2.sh
@@ -0,0 +1,30 @@
+echo "Configure for Pi2/3"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --arch=armv6t2\
+ --cpu=cortex-a7\
+ --target-os=linux\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
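Both conf_pi1.sh and conf_pi2.sh are run from the top of the FFmpeg source tree. Judging by the RPI_TOOLROOT and RPI_OPT_VC definitions, they assume the Raspberry Pi cross-toolchain and firmware checkouts sit alongside that tree, roughly:

    <work>/ffmpeg/       <- run ./pi-util/conf_pi1.sh or conf_pi2.sh from here
    <work>/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/
    <work>/firmware/opt/vc/

where <work> is any working directory. If the checkouts live elsewhere, RPI_TOOLROOT and RPI_OPT_VC should be the only variables that need changing, since the include, library-search and cross-prefix paths are all derived from them.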
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100755
index 0000000000..70f7be22bb
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+import string
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+ffmpeg_exec = "./ffmpeg"
+
+def testone(fileroot, srcname, es_file, md5_file):
+ tmp_root = "/tmp"
+
+ names = srcname.split('/')
+ while len(names) > 1:
+ tmp_root = os.path.join(tmp_root, names[0])
+ del names[0]
+ name = names[0]
+
+ if not os.path.exists(tmp_root):
+ os.makedirs(tmp_root)
+
+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
+ try:
+ os.remove(dec_file)
+ except:
+ pass
+
+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+ # Unaligned needed for cropping conformance
+ rstr = subprocess.call(
+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
+ m1 = None
+ m2 = None
+ with open(os.path.join(fileroot, md5_file)) as f:
+ for line in f:
+ m1 = re.search("[0-9a-f]{32}", line.lower())
+ if m1:
+ break
+
+ with open(dec_file) as f:
+ m2 = re.search("[0-9a-f]{32}", f.readline())
+ except:
+ pass
+
+ if m1 and m2 and m1.group() == m2.group():
+ print >> flog, "Match: " + m1.group()
+ rv = 0
+ elif not m1:
+ print >> flog, "****** Cannot find m1"
+ rv = 3
+ elif not m2:
+ print >> flog, "****** Cannot find m2"
+ rv = 2
+ else:
+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+ rv = 1
+ flog.close()
+ return rv
+
+def scandir(root):
+ aconf = []
+ ents = os.listdir(root)
+ ents.sort(key=str.lower)
+ for name in ents:
+ test_path = os.path.join(root, name)
+ if S_ISDIR(os.stat(test_path).st_mode):
+ files = os.listdir(test_path)
+ es_file = "?"
+ md5_file = "?"
+ for f in files:
+ (base, ext) = os.path.splitext(f)
+ if base[0] == '.':
+ pass
+ elif ext == ".bit" or ext == ".bin":
+ es_file = f
+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
+ if md5_file == "?":
+ md5_file = f
+ elif base[-3:] == "yuv":
+ md5_file = f
+ aconf.append((1, name, es_file, md5_file))
+ return aconf
+
+def runtest(name, tests):
+ if not tests:
+ return True
+ for t in tests:
+ if name[0:len(t)] == t or name.find("/" + t) != -1:
+ return True
+ return False
+
+def doconf(csva, tests, test_root):
+ unx_failures = []
+ unx_success = []
+ failures = 0
+ successes = 0
+ for a in csva:
+ exp_test = int(a[0])
+ if (exp_test and runtest(a[1], tests)):
+ name = a[1]
+ print "==== ", name,
+ sys.stdout.flush()
+
+ rv = testone(os.path.join(test_root, name), name, a[2], a[3])
+ if (rv == 0):
+ successes += 1
+ else:
+ failures += 1
+
+ if (rv == 0):
+ if exp_test == 2:
+ print ": * OK *"
+ unx_success.append(name)
+ else:
+ print ": ok"
+ elif exp_test == 2 and rv == 1:
+ print ": fail"
+ elif exp_test == 3 and rv == 2:
+ # Call an expected "crash" an abort
+ print ": abort"
+ else:
+ unx_failures.append(name)
+ if rv == 1:
+ print ": * FAIL *"
+ elif (rv == 2) :
+ print ": * CRASH *"
+ elif (rv == 3) :
+ print ": * MD5 MISSING *"
+ else :
+ print ": * BANG *"
+
+ if unx_failures or unx_success:
+ print "Unexpected Failures:", unx_failures
+ print "Unexpected Success: ", unx_success
+ else:
+ print "All tests normal:", successes, "ok,", failures, "failed"
+
+
+class ConfCSVDialect(csv.Dialect):
+ delimiter = ','
+ doublequote = True
+ lineterminator = '\n'
+ quotechar='"'
+ quoting = csv.QUOTE_MINIMAL
+ skipinitialspace = True
+ strict = True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+ argp.add_argument("tests", nargs='*')
+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
+ args = argp.parse_args()
+
+ if args.csvgen:
+ csv.writer(sys.stdout).writerows(scandir(args.test_root))
+ exit(0)
+
+ with open(args.csv, 'rt') as csvfile:
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+
+ doconf(csva, args.tests, args.test_root)
+
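A note on the conf_h265*.csv files consumed above: each row is expected-result code, test name, elementary-stream file, md5 file; any further columns (used as comments on a few rows) are ignored. From the way doconf() and testone() treat the first column, 0 means the test is skipped, 1 means a normal pass is expected, 2 marks a known md5 mismatch, and 3 marks a stream expected to produce no decoded md5 at all (reported as "abort"). A minimal sketch of that interpretation, using a row that appears in conf_h265.csv:

    # Illustrative only - mirrors the column handling in doconf() above.
    EXPECT = {0: "skipped", 1: "expect match", 2: "expect mismatch", 3: "expect abort"}
    row = ["1", "WP_B_Toshiba_3", "WP_B_Toshiba_3.bit", "WP_B_Toshiba_3_yuv.md5"]
    exp_test, name, es_file, md5_file = int(row[0]), row[1], row[2], row[3]
    print("%s -> %s" % (name, EXPECT[exp_test]))    # WP_B_Toshiba_3 -> expect match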
diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
new file mode 100755
index 0000000000..27cc453963
--- /dev/null
+++ b/pi-util/ffperf.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+
+import time
+import string
+import os
+import tempfile
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class tstats:
+ close_threshold = 0.01
+
+ def __init__(self, stats_dict=None):
+ if stats_dict != None:
+ self.name = stats_dict["name"]
+ self.elapsed = float(stats_dict["elapsed"])
+ self.user = float(stats_dict["user"])
+ self.sys = float(stats_dict["sys"])
+
+ def times_str(self):
+ ctime = self.sys + self.user
+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+
+ def dict(self):
+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+
+ def is_close(self, other):
+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+
+ def __lt__(self, other):
+ return self.elapsed < other.elapsed
+ def __gt__(self, other):
+ return self.elapsed > other.elapsed
+
+    @staticmethod
+    def time_file(name, prefix):
+        stats = tstats()
+        stats.name = name
+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
+                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog)
+        pinfo = os.wait4(cproc.pid, 0)
+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+ stats.elapsed = end_time - start_time
+ stats.user = pinfo[2].ru_utime
+ stats.sys = pinfo[2].ru_stime
+ return stats
+
+
+def common_prefix(s1, s2):
+ for i in range(min(len(s1),len(s2))):
+ if s1[i] != s2[i]:
+ return s1[:i]
+ return s1[:i+1]
+
+def main():
+ global flog
+
+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
+To blank the screen before starting, use "xdg-screensaver activate"
+(for some reason this doesn't seem to work from within Python).
+""")
+
+ argp.add_argument("streams", nargs='*')
+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
+ argp.add_argument("--csv_in", help="CSV input filename")
+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
+
+ args = argp.parse_args()
+
+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
+ csv_out.writeheader()
+
+ stats_in = {}
+ if args.csv_in != None:
+ with open(args.csv_in, 'r', newline='') as f_in:
+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
+
+ streams = args.streams
+ if not streams:
+ if not stats_in:
+ print ("No source streams specified")
+ return 1
+ prefix = "" if args.prefix == None else args.prefix
+ streams = [k for k in stats_in]
+ elif args.prefix != None:
+ prefix = args.prefix
+ else:
+ prefix = streams[0]
+ for f in streams[1:]:
+ prefix = common_prefix(prefix, f)
+ pp = prefix.rpartition(os.sep)
+ prefix = pp[0] + pp[1]
+ streams = [s[len(prefix):] for s in streams]
+
+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
+ print ("====", f)
+
+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
+ for i in range(3):
+ t = tstats.time_file(f, prefix)
+ print ("...", t.times_str())
+ if t0 > t:
+ t0 = t
+
+ if t0.name in stats_in:
+ pstat = stats_in[t0.name]
+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
+
+ csv_out.writerow(t0.dict())
+
+ print ()
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
+
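For each stream, ffperf.py decodes the first 30 seconds of input three times, keeps the fastest run, and takes user/sys CPU time from the os.wait4() rusage of the ffmpeg child. Results are written to ffperf_out.csv (columns name, elapsed, user, sys); passing an earlier run back in with --csv_in prints the previous figures alongside a marker ("---" within the 1% close_threshold, "<<<" if the new run is faster, ">>>" if slower). A sketch of reloading such a CSV for an ad-hoc comparison:

    # Illustrative only - mirrors the --csv_in handling in main() above.
    import csv
    with open("ffperf_out.csv", newline='') as f:
        baseline = {row["name"]: float(row["elapsed"]) for row in csv.DictReader(f)}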
diff --git a/pi-util/make_array.py b/pi-util/make_array.py
new file mode 100755
index 0000000000..864fa5e704
--- /dev/null
+++ b/pi-util/make_array.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+# Usage
+# make_array file.bin
+# Produces file.h with array of bytes.
+#
+import sys
+for file in sys.argv[1:]:
+ prefix,suffix = file.split('.')
+ assert suffix=='bin'
+ name=prefix.split('/')[-1]
+ print 'Converting',file
+ with open(prefix+'.h','wb') as out:
+ print >>out, 'static const unsigned char',name,'[] = {'
+ with open(file,'rb') as fd:
+ for byte in fd.read():
+ print >>out, '%d,' % ord(byte)
+ print >>out,'};'
+
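make_array.py is a small Python 2 helper (it uses print statements, like ffconf.py above): for an argument such as a hypothetical libavcodec/rpi_shader.bin it writes libavcodec/rpi_shader.h containing static const unsigned char rpi_shader[] = { ... }; with one decimal byte value per line, so a binary blob can be compiled straight into C code.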
diff --git a/pi-util/qem.sh b/pi-util/qem.sh
new file mode 100755
index 0000000000..5ce2eeaf72
--- /dev/null
+++ b/pi-util/qem.sh
@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+QASM=python\ ../local/bin/qasm.py
+SRC_FILE=libavcodec/rpi_shader.qasm
+DST_BASE=shader
+
+cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
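qem.sh copies libavcodec/rpi_shader_cmd.h into a Broadcom VC4 development tree and regenerates shader.c/shader.h there from libavcodec/rpi_shader.qasm; the TARGET_DIR and the ../local/bin/qasm.py path are clearly specific to the original author's setup and will need adjusting before the script is usable elsewhere.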
diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
new file mode 100755
index 0000000000..5935a11ca5
--- /dev/null
+++ b/pi-util/v3dusage.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import re
+
+def do_logparse(logname):
+
+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
+
+ ttotal = {'idle':0.0}
+ tstart = {}
+ qctotal = {}
+ qtstotal = {}
+ l2hits = {}
+ l2total = {}
+ time0 = None
+ idle_start = None
+ qpu_op_no = 0
+ op_count = 0
+
+ with open(logname, "rt") as infile:
+ for line in infile:
+ match = rmatch.match(line)
+ if match:
+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
+ time = float(match.group(1))
+ unit = match.group(3)
+ opstart = not match.group(2)
+ optype = match.group(7)
+ hascb = match.group(8) != "0"
+
+ if unit == 'qpu1':
+ unit = unit + "." + str(qpu_op_no)
+ if not opstart:
+ if hascb or optype == 'EXECUTE_SYNC':
+ qpu_op_no = 0
+ else:
+ qpu_op_no += 1
+
+ # Ignore sync type
+ if optype == 'EXECUTE_SYNC':
+ continue
+
+ if not time0:
+ time0 = time
+
+ if opstart:
+ tstart[unit] = time;
+ elif unit in tstart:
+ op_count += 1
+ if not unit in ttotal:
+ ttotal[unit] = 0.0
+ ttotal[unit] += time - tstart[unit]
+ del tstart[unit]
+
+ if not idle_start and not tstart:
+ idle_start = time
+ elif idle_start and tstart:
+ ttotal['idle'] += time - idle_start
+ idle_start = None
+
+ match = rqcycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qctotal:
+ qctotal[unit] = 0
+ qctotal[unit] += int(match.group(2))
+
+ match = rqtscycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qtstotal:
+ qtstotal[unit] = 0
+ qtstotal[unit] += int(match.group(2))
+
+ match = rl2hits.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in l2total:
+ l2total[unit] = 0
+ l2hits[unit] = 0
+ l2total[unit] += int(match.group(3))
+ if match.group(2) == "hits":
+ l2hits[unit] += int(match.group(3))
+
+
+ if not time0:
+ print "No v3d profile records found"
+ else:
+ tlogged = time - time0
+
+ print "Logged time:", tlogged, " Op count:", op_count
+ for unit in sorted(ttotal):
+            print '%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
+        print
+        for unit in sorted(qctotal):
+            if not unit in qtstotal:
+                qtstotal[unit] = 0
+            print '%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
+            if unit in l2total:
+                print ' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
+
+
+
+if __name__ == '__main__':
+ argp = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="QPU/VPU perf summary from VC logging",
+ epilog = """
+Will also summarise TMU stalls if logging requests are set in the qpu noflush
+param in the profiled code.
+
+Example use:
+ vcgencmd set_logging level=0xc0
+ <command to profile>
+ sudo vcdbg log msg >& t.log
+ v3dusage.py t.log
+""")
+
+ argp.add_argument("logfile")
+ args = argp.parse_args()
+
+ do_logparse(args.logfile)
+
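The regular expressions at the top of do_logparse() define the shape of the "sudo vcdbg log msg" lines the script understands: a millisecond-precision timestamp, an optional "done ", a unit name (vpu0, vpu1 or qpu1), an upper-case op type and a control-block handle. A minimal sketch with an invented but correctly shaped line:

    # Illustrative only - the op name and cb handle below are invented, but the
    # line shape is what the rmatch pattern in do_logparse() expects.
    import re
    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
    line = "1234.567: qpu1 EXECUTE_QPU cb:3f00a200 ..."
    m = rmatch.match(line)
    print("%s %s start=%s" % (m.group(3), m.group(7), m.group(2) is None))   # qpu1 EXECUTE_QPU start=True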