LibreELEC.tv/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
Shaun2029 ce9deb7aa7 Fixes: Libreelec 8.2.0.1 Rpi3 3D iso not playing properly
This amends commit b27b2a9 with the omitted MVC parser code for ffmpeg-99.1003-pfcd_hevc_optimisations.patch
2017-12-15 18:12:22 +00:00

30518 lines
955 KiB

diff --git a/.gitignore b/.gitignore
index 524fb73c16..bcc983739f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
*.a
*.o
*.o.*
+*.bin
*.d
*.def
*.dll
@@ -23,6 +24,7 @@
.\#*
/.config
/.version
+/build/
/ffmpeg
/ffplay
/ffprobe
diff --git a/ffmpeg.c b/ffmpeg.c
index cdded8673f..5eee7dfd40 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -23,6 +23,11 @@
* multimedia converter based on the FFmpeg libraries
*/
+#ifdef RPI
+#define RPI_DISPLAY
+#define RPI_DISPLAY_ALL 0
+#endif
+
#include "config.h"
#include <ctype.h>
#include <string.h>
@@ -42,6 +47,7 @@
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswresample/swresample.h"
+#include "libavutil/atomic.h"
#include "libavutil/opt.h"
#include "libavutil/channel_layout.h"
#include "libavutil/parseutils.h"
@@ -66,6 +72,25 @@
# include "libavfilter/buffersrc.h"
# include "libavfilter/buffersink.h"
+#ifdef RPI_DISPLAY
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/mmal/mmal.h>
+#include <interface/mmal/mmal_parameters_camera.h>
+#include <interface/mmal/mmal_buffer.h>
+#include <interface/mmal/mmal_port.h>
+#include <interface/mmal/util/mmal_util.h>
+#include <interface/mmal/util/mmal_default_components.h>
+#include <interface/mmal/util/mmal_connection.h>
+#include <interface/mmal/util/mmal_util_params.h>
+#pragma GCC diagnostic pop
+#include "libavcodec/rpi_qpu.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "libavcodec/rpi_zc.h"
+#endif
+
#if HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/types.h>
@@ -158,6 +183,241 @@ static int restore_tty;
static void free_input_threads(void);
#endif
+#ifdef RPI_DISPLAY
+
+#define NUM_BUFFERS 4
+
+
+typedef struct rpi_display_env_s
+{
+ MMAL_COMPONENT_T* display;
+ MMAL_COMPONENT_T* isp;
+ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup
+ MMAL_CONNECTION_T * conn;
+
+ MMAL_POOL_T *rpi_pool;
+ volatile int rpi_display_count;
+ enum AVPixelFormat avfmt;
+} rpi_display_env_t;
+
+static rpi_display_env_t * rpi_display_env = NULL;
+
+
+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port)
+{
+ MMAL_POOL_T* pool;
+ mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
+ assert(pool);
+
+ return pool;
+}
+
+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata;
+ av_rpi_zc_unref(buffer->user_data);
+ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1);
+ mmal_buffer_header_release(buffer);
+}
+
+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ mmal_buffer_header_release(buffer);
+}
+
+#define DISPLAY_PORT_DEPTH 4
+
+static rpi_display_env_t *
+display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h)
+{
+ MMAL_STATUS_T err;
+ MMAL_DISPLAYREGION_T region =
+ {
+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+ .layer = 2,
+ .fullscreen = 0,
+ .dest_rect = {x, y, w, h}
+ };
+#if RPI_ZC_SAND_8_IN_10_BUF
+ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt;
+#else
+ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt;
+#endif
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
+ rpi_display_env_t * de;
+ int isp_req = (fmt == AV_PIX_FMT_SAND64_10);
+
+ bcm_host_init(); // Needs to be done by someone...
+
+ if ((de = av_mallocz(sizeof(*de))) == NULL) {
+ return NULL;
+ }
+
+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display);
+ av_assert0(de->display);
+ de->port_in = de->display->input[0];
+
+ if (isp_req)
+ {
+ mmal_component_create("vc.ril.isp", &de->isp);
+ de->port_in = de->isp->input[0];
+ }
+
+ mmal_port_parameter_set(de->display->input[0], &region.hdr);
+
+ {
+ MMAL_PORT_T * const port = de->port_in;
+ MMAL_ES_FORMAT_T* const format = port->format;
+ port->userdata = (struct MMAL_PORT_USERDATA_T *)de;
+ port->buffer_num = DISPLAY_PORT_DEPTH;
+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 :
+ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 :
+ MMAL_ENCODING_I420;
+ format->es->video.width = geo.stride_y;
+ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ?
+ (h + 15) & ~15 : geo.height_y; // Magic
+ format->es->video.crop.x = 0;
+ format->es->video.crop.y = 0;
+ format->es->video.crop.width = w;
+ format->es->video.crop.height = h;
+ mmal_port_format_commit(port);
+ }
+
+ de->rpi_pool = display_alloc_pool(de->port_in);
+ mmal_port_enable(de->port_in,display_cb_input);
+
+ if (isp_req) {
+ MMAL_PORT_T * const port_out = de->isp->output[0];
+ mmal_log_dump_port(de->port_in);
+ mmal_format_copy(port_out->format, de->port_in->format);
+ if (fmt == AV_PIX_FMT_SAND64_10) {
+ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS ||
+ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS)
+ {
+ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n");
+ }
+ else
+ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n");
+
+ }
+ port_out->format->encoding = MMAL_ENCODING_I420;
+ mmal_log_dump_port(port_out);
+ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n");
+ goto fail;
+ }
+ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) {
+ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n");
+ goto fail;
+ }
+ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) {
+ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n");
+ goto fail;
+ }
+ mmal_port_enable(de->isp->control,display_cb_control);
+ mmal_component_enable(de->isp);
+ }
+
+ mmal_component_enable(de->display);
+ mmal_port_enable(de->display->control,display_cb_control);
+ de->avfmt = fmt;
+
+ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
+
+ return de;
+
+fail:
+ // **** Free stuff
+ return NULL;
+}
+
+static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
+{
+ MMAL_BUFFER_HEADER_T* buf;
+
+ if (de == NULL)
+ return;
+
+ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+ return;
+ }
+
+ buf = mmal_queue_get(de->rpi_pool->queue);
+ if (!buf) {
+ // Running too fast so drop the frame
+ printf("Q alloc failure\n");
+ return;
+ }
+ assert(buf);
+ buf->cmd = 0;
+ buf->offset = 0; // Offset to valid data
+ buf->flags = 0;
+ {
+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1);
+ if (fr_buf == NULL) {
+ mmal_buffer_header_release(buf);
+ return;
+ }
+
+ buf->user_data = fr_buf;
+ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal
+ buf->offset = av_rpi_zc_offset(fr_buf);
+ buf->length = av_rpi_zc_length(fr_buf);
+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
+ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1);
+ }
+#if RPI_DISPLAY_ALL
+ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
+ usleep(5000);
+ }
+#endif
+
+ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
+ {
+ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
+ display_cb_input(de->port_in, buf);
+ }
+}
+
+static void display_exit(rpi_display_env_t ** const pde)
+{
+ rpi_display_env_t * const de = *pde;
+ *pde = NULL;
+
+ if (de != NULL) {
+// sleep(120);
+
+ if (de->port_in != NULL) {
+ mmal_port_disable(de->port_in);
+ }
+
+ // The above disable should kick out all buffers - check that
+ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) {
+ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count));
+ }
+
+ if (de->conn != NULL) {
+ mmal_connection_destroy(de->conn);
+ }
+ if (de->isp != NULL) {
+ mmal_component_destroy(de->isp);
+ }
+ if (de->display != NULL) {
+ mmal_component_destroy(de->display);
+ }
+ if (de->rpi_pool != NULL) {
+ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool);
+ }
+
+ av_free(de);
+ }
+}
+
+#endif
+
+
/* sub2video hack:
Convert subtitles to video with alpha to insert them in filter graphs.
This is a temporary solution until libavfilter gets real subtitles support.
@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret)
avformat_close_input(&input_files[i]->ctx);
av_freep(&input_files[i]);
}
+
+#ifdef RPI_DISPLAY
+ display_exit(&rpi_display_env);
+#endif
+
for (i = 0; i < nb_input_streams; i++) {
InputStream *ist = input_streams[i];
@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret)
av_freep(&ist->filters);
av_freep(&ist->hwaccel_device);
+#ifdef RPI_DISPLAY
+ av_rpi_zc_uninit(ist->dec_ctx);
+#endif
avcodec_free_context(&ist->dec_ctx);
av_freep(&input_streams[i]);
@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret)
}
term_exit();
ffmpeg_exited = 1;
+
}
void remove_avoptions(AVDictionary **a, AVDictionary *b)
@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s,
if (ost->source_index >= 0)
ist = input_streams[ost->source_index];
+#ifdef RPI_DISPLAY
+ if (next_picture && ist != NULL)
+ {
+ if (rpi_display_env == NULL)
+ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height);
+ display_frame(ist->dec_ctx, rpi_display_env, next_picture);
+ }
+#endif
+
if (filter->inputs[0]->frame_rate.num > 0 &&
filter->inputs[0]->frame_rate.den > 0)
duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
ist->dec_ctx->opaque = ist;
ist->dec_ctx->get_format = get_format;
ist->dec_ctx->get_buffer2 = get_buffer;
+
+#ifdef RPI_DISPLAY
+ // Overrides the above get_buffer2
+ av_rpi_zc_init(ist->dec_ctx);
+#endif
+
ist->dec_ctx->thread_safe_callbacks = 1;
av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
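For orientation, an illustrative C sketch (not part of the patch) of how the ffmpeg.c additions above fit together: av_rpi_zc_init() swaps in zero-copy buffers at decoder setup, do_video_out() lazily builds the MMAL display pipe from the first decoded frame and queues every frame to it, and ffmpeg_cleanup() tears it down with display_exit(). The wrapper name rpi_show_frame is made up here purely for illustration.

#ifdef RPI_DISPLAY
/* Hypothetical wrapper condensing the do_video_out() hunk above. */
static void rpi_show_frame(InputStream *const ist, AVFrame *const frame)
{
    if (rpi_display_env == NULL)   /* created lazily, sized to the first frame */
        rpi_display_env = display_init(frame->format, 0, 0,
                                       frame->width, frame->height);
    display_frame(ist->dec_ctx, rpi_display_env, frame);
}
#endif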
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb28aea1e2..741aa0bdc4 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -5,6 +5,16 @@ NAME = avcodec
HEADERS = avcodec.h \
avdct.h \
avfft.h \
+ rpi_opts.h \
+ rpi_qpu.h \
+ rpi_shader.h \
+ rpi_shader_cmd.h \
+ rpi_shader_template.h \
+ rpi_shader_template_fn.h \
+ rpi_mailbox.h \
+ rpi_hevc_transform8.h \
+ rpi_hevc_transform10.h \
+ rpi_zc.h \
d3d11va.h \
dirac.h \
dv_profile.h \
@@ -43,6 +53,11 @@ OBJS = allcodecs.o \
resample.o \
resample2.o \
utils.o \
+ rpi_qpu.o \
+ rpi_shader.o \
+ rpi_shader_template.o \
+ rpi_mailbox.o \
+ rpi_zc.o \
vorbis_parser.o \
xiph.o \
@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
$(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
endif
+
+QASM_PY := ../local/bin/qasm.py
+VASMVIDCORE := ../local/bin/vasmvidcore_std
+
+ifneq ("$(wildcard $(QASM_PY))","")
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+endif
+
+ifneq ("$(wildcard $(VASMVIDCORE))","")
+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
+
+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
+ python pi-util/make_array.py $<
+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
+ python pi-util/make_array.py $<
+
+endif
+
+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
+$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 54efaad..02a89c3 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -667,6 +667,7 @@ void avcodec_register_all(void)
REGISTER_PARSER(H261, h261);
REGISTER_PARSER(H263, h263);
REGISTER_PARSER(H264, h264);
+ REGISTER_PARSER(H264_MVC, h264_mvc);
REGISTER_PARSER(HEVC, hevc);
REGISTER_PARSER(MJPEG, mjpeg);
REGISTER_PARSER(MLP, mlp);
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a4ceca7f46..f8229a80e2 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
+ arm/hevc_misc_neon.o \
arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_epel_neon.o \
arm/hevcdsp_idct_neon.o \
- arm/hevcdsp_qpel_neon.o
+ arm/hevcdsp_cres_neon.o \
+ arm/hevcdsp_res16_neon.o \
+ arm/hevcdsp_qpel_neon.o \
+ arm/hevcdsp_sao_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b45e..0a3980a1ef 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,13 +26,34 @@
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"
+
+#if UNCHECKED_BITSTREAM_READER
+#define LOAD_16BITS_BEHI\
+ "ldrh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#elif CONFIG_THUMB
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "it cs \n\t"\
+ "ldrhcs %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#else
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "ldrcsh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#endif
+
+
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
uint8_t *const state)
{
int bit;
+#if 0
void *reg_b, *reg_c, *tmp;
-
__asm__ volatile(
"ldrb %[bit] , [%[state]] \n\t"
"add %[r_b] , %[tables] , %[lps_off] \n\t"
@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
[mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
: "memory", "cc"
);
+#else
+ // *** Not thumb compatible yet
+ unsigned int reg_b, tmp;
+ __asm__ (
+ "ldrb %[bit] , [%[state]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b] , %[tmp], lsl #1] \n\t"
+// %bit = *state
+// %range = range
+// %tmp = RangeLPS
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state]] \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+
+ "bne 2f \n\t"
+ LOAD_16BITS_BEHI
+ "lsr %[tmp] , %[tmp] , #15 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "sub %[tmp] , %[tmp] , %[r_b] \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [ptr]"+&r"(c->bytestream),
+ [tmp]"=&r"(tmp)
+ : [state]"r"(state),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+#endif
return bit & 1;
}
+
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+ int rv = 0;
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "adc %[rv] , %[rv] , #0 \n\t"
+ "it cs \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ite cc \n\t"
+ "rsbcc %[rv] , %[rv] , #0 \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
#endif /* HAVE_ARMV6T2_INLINE */
#endif /* AVCODEC_ARM_CABAC_H */
diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
new file mode 100644
index 0000000000..31d3c59205
--- /dev/null
+++ b/libavcodec/arm/hevc_cabac.h
@@ -0,0 +1,491 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+ unsigned int n;
+ __asm__ (
+ "rev %[n], %[x] \n\t"
+ : [n]"=r"(n)
+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+ :
+ );
+ return n << (bits & 7);
+}
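In plain C this peek is just a big-endian 32-bit load followed by a sub-byte shift; a minimal sketch (illustrative, not part of the patch), assuming AV_RB32 from libavutil/intreadwrite.h:

static inline uint32_t hevc_mem_bits32_c(const void *p, const unsigned int bits)
{
    /* rev of a little-endian load == big-endian read of the same 4 bytes */
    return AV_RB32((const uint8_t *)p + (bits >> 3)) << (bits & 7);
}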
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about / use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ int rv;
+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+ __asm__ (
+ "ssat %[rv], #16, %[t], ASR #1 \n\t"
+ : [rv]"=r"(rv)
+ : [t]"r"(t)
+ :
+ );
+ return rv;
+}
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ int t;
+ __asm__ (
+ "lsl %[t], %[coeff], #1 \n\t"
+ "lsrs %[t], %[t], %[shift] \n\t"
+ "it eq \n\t"
+ "subeq %[stat], %[stat], #1 \n\t"
+ "cmp %[t], #6 \n\t"
+ "adc %[stat], %[stat], #0 \n\t"
+ "usat %[stat], #8, %[stat] \n\t"
+ : [stat]"+&r"(*stat_coeff),
+ [t]"=&r"(t)
+ : [coeff]"r"(last_coeff_abs_level_remaining),
+ [shift]"r"(c_rice_param)
+ : "cc"
+ );
+}
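What the two helpers above compute, written out as plain C for reference (an illustrative sketch, not part of the patch; the clamp ranges come directly from the ssat #16 and usat #8 operands):

static inline int trans_scale_sat_c(const int level, const unsigned int scale,
                                    const unsigned int scale_m, const unsigned int shift)
{
    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
    t >>= 1;                            /* the ", ASR #1" folded into ssat */
    return t < -32768 ? -32768 :        /* ssat #16: signed 16-bit saturation */
           t >  32767 ?  32767 : t;
}

static inline void update_rice_c(uint8_t *const stat_coeff,
                                 const unsigned int last_coeff_abs_level_remaining,
                                 const unsigned int c_rice_param)
{
    const unsigned int t = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
    int stat = *stat_coeff;
    if (t == 0)                         /* lsrs sets Z; subeq */
        stat--;
    if (t >= 6)                         /* cmp #6 sets C; adc #0 */
        stat++;
    if (stat < 0)   stat = 0;           /* usat #8: clamp to [0, 255] */
    if (stat > 255) stat = 255;
    *stat_coeff = (uint8_t)stat;
}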
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater than 1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i, reg_b, st, tmp, bit, rv;
+ __asm__ (
+ "mov %[i] , #0 \n\t"
+ "mov %[rv] , #0 \n\t"
+ "1: \n\t"
+ "add %[i] , %[i] , #1 \n\t"
+ "cmp %[rv] , #0 \n\t"
+ "ite eq \n\t"
+ "usateq %[st] , #2 , %[i] \n\t"
+ "movne %[st] , #0 \n\t"
+
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "and %[bit] , %[bit] , #1 \n\t"
+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "it ne \n\t"
+ "cmpne %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+ "tst %[tmp] , %[tmp] \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+ "cmp %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [i]"=&r"(i),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st),
+ [rv]"=&r"(rv)
+ : [state0]"r"(state0),
+ [n]"r"(n),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+ return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t const * ctx_map,
+ uint8_t * p)
+{
+ unsigned int reg_b, tmp, st, bit;
+ __asm__ (
+ "1: \n\t"
+// Get bin from map
+ "ldrb %[st] , [%[ctx_map], %[n]] \n\t"
+
+// Load state & ranges
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "tst %[bit] , #1 \n\t"
+// GCC asm seems to need strbne written differently for thumb and arm
+#if CONFIG_THUMB
+ "it ne \n\t"
+ "strbne %[n] , [%[idx]] , #1 \n\t"
+#else
+ "strneb %[n] , [%[idx]] , #1 \n\t"
+#endif
+
+// Renorm
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "subs %[n] , %[n] , #1 \n\t"
+#if CONFIG_THUMB
+ "itt ne \n\t"
+ "lslsne %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#else
+ "lslnes %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#endif
+
+// If we have bits left then n must be 0 so give up now
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+// Check to see if we still have more to do
+ "cmp %[n] , #0 \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [idx]"+&r"(p),
+ [n]"+&r"(n),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st)
+ : [state0]"r"(state0),
+ [ctx_map]"r"(ctx_map),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+
+ return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+//
+// By and large these are (at best) no faster than their C equivalents - the
+// only one worth having is _peek where we do a slightly better job than the
+// compiler
+//
+// The others have been stashed here for reference in case larger scale asm
+// is attempted in which case they might be a useful base
+
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+ uint32_t rv, tmp;
+ __asm__ (
+ "bic %[rv] , %[low], #1 \n\t"
+ "cmp %[inv] , #0 \n\t"
+ "it ne \n\t"
+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+ : // Outputs
+ [rv]"=&r"(rv),
+ [tmp]"=r"(tmp)
+ : // Inputs
+ [low]"r"(c->low),
+ [inv]"r"(c->range)
+ : // Clobbers
+ "cc"
+ );
+ return rv << 1;
+}
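For comparison with the note above, a plain-C rendering of the peek (illustrative, not part of the patch). It assumes, as the [inv] operand does, that in by22 mode c->range holds a 32-bit inverse multiplier, with 0 meaning the divider is 1:

static inline uint32_t get_cabac_by22_peek_c(const CABACContext *const c)
{
    uint32_t x = c->low & ~1U;
    if (c->range != 0)                  /* umullne: keep only the high 32 bits */
        x = (uint32_t)(((uint64_t)x * c->range) >> 32);
    return x << 1;
}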
+
+#if 0
+
+// ***** Slower than the C :-(
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+{
+ uint32_t m, tmp;
+ __asm__ (
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[m], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+ "mul %[tmp], %[range], %[tmp] \n\t"
+
+ "rev %[m], %[m] \n\t"
+
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[m], %[m], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[m], lsr #9 \n\t"
+ : // Outputs
+ [m]"=&r"(m),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [n]"r"(n),
+ [val]"r"(val),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ );
+}
+
+
+// Works but slower than C
+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+{
+ uint32_t n, val, tmp, level;
+
+// PROFILE_START();
+
+ __asm__ (
+ // Peek
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Count bits (n = prefix)
+ "mvn %[n], %[val] \n\t"
+ "clz %[n], %[n] \n\t"
+
+ "lsl %[level], %[val], %[n] \n\t"
+ "subs %[tmp], %[n], #3 \n\t"
+ "blo 2f \n\t"
+
+ // prefix >= 3
+ // < tmp = prefix - 3
+ // > tmp = prefix + rice - 3
+ "add %[tmp], %[tmp], %[rice] \n\t"
+ // > n = prefix * 2 + rice - 3
+ "add %[n], %[tmp], %[n] \n\t"
+ "cmp %[n], #21 \n\t"
+ "bhi 3f \n\t"
+
+ "orr %[level], %[level], #0x80000000 \n\t"
+ "rsb %[tmp], %[tmp], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // > 22 bits used in total - need reload
+ "3: \n\t"
+
+ // Stash prefix + rice - 3 in level (only spare reg)
+ "mov %[level], %[tmp] \n\t"
+ // Restore n to flush value (prefix)
+ "sub %[n], %[n], %[tmp] \n\t"
+
+ // Flush + reload
+
+// "rsb %[tmp], %[n], #32 \n\t"
+// "lsr %[tmp], %[val], %[tmp] \n\t"
+// "mul %[tmp], %[range], %[tmp] \n\t"
+
+ // As it happens we know that all the bits we are flushing are 1
+ // so we can cheat slightly
+ "rsb %[tmp], %[range], %[range], lsl %[n] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[n], [%[ptr], %[bits], lsr #3] \n\t"
+ "rev %[n], %[n] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[n], %[n], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[n], lsr #9 \n\t"
+
+ // (reload)
+
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Build value
+
+ "mov %[n], %[level] \n\t"
+
+ "orr %[tmp], %[val], #0x80000000 \n\t"
+ "rsb %[level], %[level], #31 \n\t"
+ "lsr %[level], %[tmp], %[level] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // prefix < 3
+ "2: \n\t"
+ "rsb %[tmp], %[rice], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+ "orr %[level], %[level], %[n], lsl %[rice] \n\t"
+ "add %[n], %[n], %[rice] \n\t"
+
+ "1: \n\t"
+ // Flush
+ "add %[n], %[n], #1 \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[val], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "mul %[tmp], %[range], %[tmp] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "rev %[val], %[val] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[val], %[val], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[val], lsr #9 \n\t"
+ : // Outputs
+ [level]"=&r"(level),
+ [n]"=&r"(n),
+ [val]"=&r"(val),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [rice]"r"(c_rice_param),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ "cc"
+ );
+
+// PROFILE_ACC(residual_abs);
+
+ return level;
+}
+#endif
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S
new file mode 100644
index 0000000000..380d3c8d3b
--- /dev/null
+++ b/libavcodec/arm/hevc_idct_fn_neon.S
@@ -0,0 +1,224 @@
+@ Included multiple times from hevc_idct_neon.S
+@ Macros defined there
+
+#define DC_SHIFT (15 - BIT_DEPTH)
+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH)))
+#define TRN_SHIFT (20 - BIT_DEPTH)
+
+function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q0, r1
+ vdup.16 q1, r1
+ vst1.16 {q0, q1}, [r0]
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ mov r3, #16
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+1: subs r3, #1
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+endfunc
+
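The *_dc_ functions above all reduce to the same scalar computation before the block fill; a plain-C sketch (illustrative, not part of the patch) using the DC_ADD/DC_SHIFT macros defined at the top of this file:

static void hevc_idct_NxN_dc_c(int16_t *const coeffs, const int n)
{
    /* One rounded, shifted DC value replicated across the whole n x n block */
    const int dc = (coeffs[0] + DC_ADD) >> DC_SHIFT;
    for (int i = 0; i < n * n; i++)
        coeffs[i] = dc;
}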
+
+function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x00240053 // 36 and 83
+ vmov.32 d0[0], r3
+
+ tr4_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT)
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+
+
+function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x4a // 74
+ vmov.32 d0[0], r3
+ ldr r3, =0x1d // 29
+ vmov.32 d0[1], r3
+ ldr r3, =0x37 // 55
+ vmov.32 d1[0], r3
+
+ tr4_luma_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT)
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+
+
+function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1
+ push {r4-r8}
+ vpush {d8-d15}
+ mov r5, #16
+
+ adrl r3, tr4f
+ vld1.16 {d0, d1}, [r3]
+
+ // left half
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #128
+ //skip right half if col_limit in r1 is less than 4
+ cmp r1, #4
+ blt 1f
+ //right half
+ add r0, #8
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #136
+1:
+ // top half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #(TRN_SHIFT)
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ vstm r0!, {q1-q4}
+
+ // bottom half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #(TRN_SHIFT)
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ //vstm r0, {q1-q4}
+ vst1.16 {q1-q2}, [r0]
+ add r0, #32
+ vst1.16 {q3-q4}, [r0]
+ sub r0, #32
+ vpop {d8-d15}
+ pop {r4-r8}
+ bx lr
+endfunc
+
+#undef DC_SHIFT
+#undef DC_ADD
+#undef TRN_SHIFT
+
diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S
new file mode 100644
index 0000000000..373576b4cb
--- /dev/null
+++ b/libavcodec/arm/hevc_misc_neon.S
@@ -0,0 +1,62 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ rpi_zap_coeff_vals_neon(
+@ uint16_t * buf, [r0]
+@ unsigned int log_n_m2) [r1]
+
+function rpi_zap_coeff_vals_neon, export=1
+ vmov.i64 q8, #0
+ adr r12, zc_tab
+ vmov.i64 q9, #0
+ tst r0, #63
+ vmov.i64 q10, #0
+ add r0, #63
+ vmov.i64 q11, #0
+ and r0, #~63
+ ldr pc, [r12, r1, lsl #2]
+
+zc_tab:
+ .word zc_lc2
+ .word zc_lc3
+ .word zc_lc4
+ .word zc_lc5
+
+@ 4*4*2: "32 bytes" 64 or 0 depending on dst address
+zc_lc2:
+ it eq
+ vstmeq r0, {q8-q11}
+ bx lr
+
+@ 16*16*2 = 512 = 64 * 8
+zc_lc4:
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+ vstm r0!, {q8-q11}
+@ 8*8*2 = 128
+zc_lc3:
+ vstm r0!, {q8-q11}
+ vstm r0, {q8-q11}
+ bx lr
+
+@ 32*32*2 = 2048 = 128 * 16
+zc_lc5:
+ vmov.i64 q12, #0
+ vmov.i64 q13, #0
+ vmov.i64 q14, #0
+ vmov.i64 q15, #0
+ mov r2, #4
+1:
+ vstm r0!, {q8-q15}
+ subs r2, #1
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+
+endfunc
+
diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S
new file mode 100644
index 0000000000..bafefd4318
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_cres_neon.S
@@ -0,0 +1,296 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ General notes:
+@
+@ Residual is only guaranteed to be clipped to 16 bits
+@ This means that we do need to do vmovl, vqadd, vqmovun
+@ rather than vaddw, vqmovun (if we were clipped to 15 bits then we could get
+@ away with this)
+
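Per pixel, the note above amounts to the following plain-C operation (an illustrative sketch, not part of the patch): because the residual may occupy the full int16_t range, the sum has to be formed wider than 8 bits and then saturated, which is what the vmovl/vqadd/vqmovun sequences below do.

static inline uint8_t add_residual_px(const uint8_t px, const int16_t res)
{
    const int v = px + res;             /* vmovl.u8 + vqadd.s16: widen, saturating add */
    return v < 0   ? 0   :              /* vqmovun.s16: narrow with unsigned saturation */
           v > 255 ? 255 : (uint8_t)v;
}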
+@ ============================================================================
+@ U add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_add_residual_4x4_u_neon_8, export=1
+ vld1.8 {d16}, [r0, :64], r2
+ vld1.8 {d17}, [r0, :64], r2
+ vld1.8 {d18}, [r0, :64], r2
+ vld1.8 {d19}, [r0, :64], r2
+ vld1.16 {q0, q1}, [r1]
+ vdup.16 q2, r3
+ vdup.16 q3, r3
+ vmovl.u8 q10, d16
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+ vst1.8 {d2}, [r0, :64], r2
+ vst1.8 {d3}, [r0, :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_add_residual_8x8_u_neon_8, export=1
+ mov r12, #4
+ vdup.16 q15, r3
+1:
+ vld2.8 {d16, d17}, [r0, :128], r2
+ vld2.8 {d18, d19}, [r0, :128]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ sub r0, r2
+ vmovl.u8 q11, d18
+ vqadd.s16 q0, q10
+ vaddw.u8 q2, q15, d17
+ vqadd.s16 q1, q11
+ vaddw.u8 q3, q15, d19
+ vqmovun.s16 d16, q0
+ vqmovun.s16 d17, q2
+ vqmovun.s16 d18, q1
+ vqmovun.s16 d19, q3
+ vst2.8 {d16, d17}, [r0, :128], r2
+ vst2.8 {d18, d19}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_add_residual_16x16_u_neon_8, export=1
+ mov r12, #16
+ vdup.16 q15, r3
+1:
+ vld2.8 {q8, q9}, [r0, :256]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vaddw.u8 q2, q15, d18
+ vaddw.u8 q3, q15, d19
+ vqmovun.s16 d16, q0
+ vqmovun.s16 d17, q1
+ vqmovun.s16 d18, q2
+ vqmovun.s16 d19, q3
+ vst2.8 {q8, q9}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ V add
+
+@ add_residual4x4_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_4x4_v_neon_8, export=1
+ vld1.8 {d16}, [r0, :64], r2
+ vld1.8 {d17}, [r0, :64], r2
+ vld1.8 {d18}, [r0, :64], r2
+ vld1.8 {d19}, [r0, :64], r2
+ vld1.16 {q2, q3}, [r1]
+ vdup.16 q0, r3
+ vdup.16 q1, r3
+ vmovl.u8 q10, d16
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+ vst1.8 {d2}, [r0, :64], r2
+ vst1.8 {d3}, [r0, :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_8x8_v_neon_8, export=1
+ mov r12, #4
+ vdup.16 q15, r3
+1:
+ vld2.8 {d16, d17}, [r0, :128], r2
+ vld2.8 {d18, d19}, [r0, :128]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d17
+ sub r0, r2
+ vmovl.u8 q11, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vaddw.u8 q2, q15, d16
+ vaddw.u8 q3, q15, d18
+ vqmovun.s16 d17, q0
+ vqmovun.s16 d16, q2
+ vqmovun.s16 d19, q1
+ vqmovun.s16 d18, q3
+ vst2.8 {d16, d17}, [r0, :128], r2
+ vst2.8 {d18, d19}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_16x16_v_neon_8, export=1
+ mov r12, #16
+ vdup.16 q15, r3
+1:
+ vld2.8 {q8, q9}, [r0, :256]
+ vld1.16 {q0, q1}, [r1, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d18
+ vmovl.u8 q11, d19
+ vaddw.u8 q2, q15, d16
+ vaddw.u8 q3, q15, d17
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqmovun.s16 d16, q2
+ vqmovun.s16 d17, q3
+ vqmovun.s16 d18, q0
+ vqmovun.s16 d19, q1
+ vst2.8 {q8, q9}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ U & V add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_4x4_c_neon_8, export=1
+ vld1.8 {d16}, [r0, :64], r2
+ vld1.8 {d17}, [r0, :64], r2
+ vld1.8 {d18}, [r0, :64], r2
+ vld1.8 {d19}, [r0, :64], r2
+ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V
+ vmovl.u8 q10, d16
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+ vst1.8 {d2}, [r0, :64], r2
+ vst1.8 {d3}, [r0, :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_8x8_c_neon_8, export=1
+ mov r12, #8
+ add r3, r1, #(8*8*2) @ Offset to V
+1:
+ vld2.8 {d16, d17}, [r0, :128]
+ vld1.16 {q0}, [r1, :128]!
+ vld1.16 {q1}, [r3, :128]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst2.8 {d0, d1}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_add_residual_16x16_c_neon_8, export=1
+ mov r12, #16
+ add r3, r1, #(16*16*2) @ Offset to V
+1:
+ vld2.8 {q8, q9}, [r0, :256]
+ vld1.16 {q0, q1}, [r1, :256]!
+ vld1.16 {q2, q3}, [r3, :256]!
+ subs r12, #1
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst2.8 {q0, q1}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ 32x32 chroma never occurs so NIF
+
+@ ============================================================================
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
index 166bddb104..15c4329cdb 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -15,7 +15,7 @@
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1
*/
@@ -24,70 +24,238 @@
.macro hevc_loop_filter_chroma_start
ldr r12, [r2]
- ldr r3, [r2, #4]
- add r2, r3, r12
- cmp r2, #0
+ ldr r2, [r2, #4]
+ orrs r2, r12, r2, lsl #16
it eq
bxeq lr
.endm
-.macro hevc_loop_filter_chroma_body
- vsubl.u8 q3, d4, d2
- vsubl.u8 q11, d18, d19
- vshl.i16 q3, #2
- vadd.i16 q11, q3
- vdup.16 d0, r12
- vdup.16 d1, r3
- vrshr.s16 q11, q11, #3
- vneg.s16 q12, q0
+@ Uses: d2, d4, d18, d19
+@ Returns: d2, d4
+@ Modifies: d0-d7, d22-d25, r12
+
+.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1
+ vsubl.u8 q0, \Q0, \P0
+ vsubl.u8 q1, \P1, \Q1
+ vdup.16 d4, r2
+ lsr r2, r2, #16
+ vshl.i16 q0, #2
+ ldr r12, [sp, #0] @ r12 = &no_q
+ vadd.i16 q0, q1
+ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1]
+ vdup.16 d5, r2
+
+ vrshr.s16 q0, q0, #3
+ ldrh r12, [r12]
+ vneg.s16 q3, q2
+ vmin.s16 q0, q0, q2
+ vmovl.u8 q2, \Q0
+ vmax.s16 q0, q0, q3
+ vaddw.u8 q1, q0, \P0
+ vsub.i16 q2, q0
+ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+ vqmovun.s16 \P0, q1
+ vqmovun.s16 \Q0, q2
+.endm
+
+@ Uses r2 (tc a;b)
+@ Modifies: q0-q3
+@ On exit
+@ r12 (and flags) contain no_p;no_q
+.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth
+ vsub.i16 q0, \Q0, \P0
+ lsl r12, r2, #(\bit_depth - 8)
+ vsub.i16 q1, \P1, \Q1
+ vshl.i16 q0, #2
+ vdup.16 d4, r12
+ lsr r12, r12, #16
+ vadd.i16 q0, q1
+ ldrh r3, [r3]
+ vdup.16 d5, r12
+
+ vrshr.s16 q0, q0, #3
+ vneg.s16 q3, q2
+ movw r12, #(1 << \bit_depth) - 1
+ vmin.s16 q0, q0, q2
+ vmax.s16 q0, q0, q3
+ vdup.i16 q3, r12
+ ldr r12, [sp, #0]
+
+ vadd.i16 \P0, q0, \P0
+ vsub.i16 \Q0, q0
+
+ vmov.i64 q2, #0
+ ldrh r12, [r12]
+ vmin.s16 \P0, q3
+ vmin.s16 \Q0, q3
+ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+ vmax.s16 \P0, q2
+ vmax.s16 \Q0, q2
+.endm
+
+
+@ Preserves r12
+@ Clobbers r2
+.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v
+ vsubl.u8 q0, \Q0u, \P0u
+ vsubl.u8 q1, \Q0v, \P0v
+ vsubl.u8 q2, \P1u, \Q1u
+ vsubl.u8 q3, \P1v, \Q1v
+ vshl.i16 q0, #2
+ vshl.i16 q1, #2
+ vadd.i16 q0, q2
+ vdup.16 d4, r2
+ lsr r2, #16
+ vadd.i16 q1, q3
+
+ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all)
+ vrshr.s16 q0, #3
+ vdup.16 d6, r2
vmovl.u8 q2, d4
- vmin.s16 q11, q11, q0
- vmax.s16 q11, q11, q12
- vaddw.u8 q1, q11, d2
- vsub.i16 q2, q11
- vqmovun.s16 d2, q1
- vqmovun.s16 d4, q2
+ vmovl.u8 q3, d6
+ vuzp.16 d4, d5
+ vrshr.s16 q1, #3
+ vuzp.16 d6, d7
+
+ vmin.s16 q0, q2
+ vneg.s16 q2, q2
+ vmin.s16 q1, q3
+ vneg.s16 q3, q3
+ vmax.s16 q0, q2
+ vaddw.u8 q2, q0, \P0u
+ vmax.s16 q1, q3
+ vaddw.u8 q3, q1, \P0v
+
+ vqmovun.s16 \P0u, q2
+ vmovl.u8 q2, \Q0u
+ vqmovun.s16 \P0v, q3
+ vmovl.u8 q3, \Q0v
+ vsub.i16 q2, q0
+ vsub.i16 q3, q1
+
+ vqmovun.s16 \Q0u, q2
+ vqmovun.s16 \Q0v, q3
.endm
+@ Preserves r12
+@ Clobbers r2
+.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth
+ vsub.i16 q0, \Q0u, \P0u
+ vsub.i16 q1, \Q0v, \P0v
+ vsub.i16 q2, \P1u, \Q1u
+ vsub.i16 q3, \P1v, \Q1v
+ vshl.i16 q0, #2
+ vshl.i16 q1, #2
+ vadd.i16 q0, q2
+ vdup.16 d4, r2
+ lsr r2, #16
+ vadd.i16 q1, q3
+
+ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all)
+ vrshr.s16 q0, #3
+ vdup.16 d6, r2
+ vshll.u8 q2, d4, #\bit_depth - 8
+ vshll.u8 q3, d6, #\bit_depth - 8
+ vuzp.16 d4, d5
+ vrshr.s16 q1, #3
+ vuzp.16 d6, d7
+
+ movw r2, #(1 << \bit_depth) - 1
+ vmin.s16 q0, q2
+ vneg.s16 q2, q2
+ vmin.s16 q1, q3
+ vneg.s16 q3, q3
+ vmax.s16 q0, q2
+ vmov.i64 q2, #0
+ vmax.s16 q1, q3
+ vdup.i16 q3, r2
+ vadd.i16 \P0u, q0
+ vsub.i16 \Q0u, q0
+ vadd.i16 \P0v, q1
+ vsub.i16 \Q0v, q1
+
+ vmax.s16 \P0u, q2
+ vmax.s16 \Q0u, q2
+ vmax.s16 \P0v, q2
+ vmax.s16 \Q0v, q2
+ vmin.s16 \P0u, q3
+ vmin.s16 \Q0u, q3
+ vmin.s16 \P0v, q3
+ vmin.s16 \Q0v, q3
+.endm
+
+
+
.macro hevc_loop_filter_luma_start
ldr r12, [r3]
ldr r3, [r3, #4]
- lsl r3, #16
- orr r3, r12
- cmp r3, #0
+ orrs r3, r12, r3, lsl #16
it eq
bxeq lr
- lsr r3, #16
.endm
-.macro hevc_loop_filter_luma_body
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
- vmovl.u8 q12, d24
- vmovl.u8 q13, d26
- vmovl.u8 q14, d28
- vmovl.u8 q15, d30
+@ Uses: r2, r3, r12
+@ Modifies: r5, r6, r7, r8, r9
+
+@ Input:
+@ r2 beta (raw: needs shift for bitdepth > 8)
+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
+@ [sp,#96] &no_p[0]
+@ [sp,#100] &no_q[0]
+@
+@ Input & output
+@ 8-bit: d16-d23
+@ 16-bit: q8-q15
+@
+@ Output
+@ Z r10==0
+@ r10[ 0:7 ] no_p[0]
+@ r10[ 8:15] no_p[1]
+@ r10[16:23] no_q[0]
+@ r10[24:31] no_q[1]
+
+.macro m_filter_luma bit_depth
+.if \bit_depth == 8
+ vmovl.u8 q15, d23
+ vmovl.u8 q14, d22
+ vmovl.u8 q13, d21
+ vmovl.u8 q12, d20
+ vmovl.u8 q11, d19
+ vmovl.u8 q10, d18
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+.endif
vadd.i16 q7, q9, q11
+.if \bit_depth > 8
+ lsl r2, r2, #(\bit_depth - 8)
+.endif
vadd.i16 q6, q14, q12
+.if \bit_depth > 8
+ lsl r3, r3, #(\bit_depth - 8)
+.endif
vsub.i16 q7, q10
+ ldr r5, [sp, #96] @ Bolt no_x values together into r10
vsub.i16 q6, q13
vabd.s16 q7, q7, q10
vabd.s16 q6, q6, q13
-
+ ldrh r10, [r5]
vdup.16 q0, r2
vmov q4, q7
vmov q5, q6
- vdup.16 d4, r12
+ ldr r5, [sp, #100]
+ vdup.16 d4, r3
+ lsr r3, r3, #16
vtrn.16 q7, q4
+ ldrh r5, [r5]
vtrn.16 q6, q5
vshl.u64 q7, #32
vshr.u64 q4, #32
vshl.u64 q6, #32
+ orr r10, r10, r5, lsl #16
vshr.u64 q5, #32
vshr.u64 q7, #32
vshr.u64 q6, #32
@@ -152,7 +320,7 @@
and r9, r8, r7
cmp r9, #0
- beq weakfilter_\@
+ beq 1f
vadd.i16 q2, q11, q12
vadd.i16 q4, q9, q8
@@ -210,11 +378,11 @@
vbit q13, q3, q5
vbit q14, q2, q5
-weakfilter_\@:
+1:
mvn r8, r8
and r9, r8, r7
cmp r9, #0
- beq ready_\@
+ beq 2f
vdup.16 q4, r2
@@ -275,111 +443,1041 @@ weakfilter_\@:
vbit q11, q0, q5
vbit q12, q4, q5
-ready_\@:
+2:
+.if \bit_depth == 8
vqmovun.s16 d16, q8
- vqmovun.s16 d18, q9
- vqmovun.s16 d20, q10
- vqmovun.s16 d22, q11
- vqmovun.s16 d24, q12
- vqmovun.s16 d26, q13
- vqmovun.s16 d28, q14
- vqmovun.s16 d30, q15
+ cmp r10, #0
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vqmovun.s16 d20, q12
+ vqmovun.s16 d21, q13
+ vqmovun.s16 d22, q14
+ vqmovun.s16 d23, q15
+.else
+ movw r12, #(1 << \bit_depth - 1)
+ vmov.i64 q0, #0
+ vdup.i16 q1, r12
+ @ q8 & q15 should be unaltered and so don't require clipping
+ vmax.s16 q9, q0
+ cmp r10, #0
+ vmax.s16 q10, q0
+ vmax.s16 q11, q0
+ vmax.s16 q12, q0
+ vmax.s16 q13, q0
+ vmax.s16 q14, q0
+ vmin.s16 q9, q1
+ vmin.s16 q10, q1
+ vmin.s16 q11, q1
+ vmin.s16 q12, q1
+ vmin.s16 q13, q1
+ vmin.s16 q14, q1
+.endif
+ mov pc, lr
.endm
+function hevc_loop_filter_luma_body
+ m_filter_luma 8
+endfunc
+
+@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
+function ff_hevc_v_loop_filter_luma2_neon_8, export=1
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr} @ 8 regs = 32 bytes
+
+ ldr r4, [sp, #40]
+ b v_loop_luma_common
+endfunc
+
+
+@ void ff_hevc_v_loop_filter_luma_neon(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int _beta, [r2]
+@ int *_tc, [r3]
+@ uint8_t *_no_p, [sp+0]
+@ uint8_t *_no_q) [sp+4]
+
+
function ff_hevc_v_loop_filter_luma_neon, export=1
hevc_loop_filter_luma_start
- push {r5-r11}
+ push {r4-r10,lr}
+
+ sub r4, r0, #4
+v_loop_luma_common:
vpush {d8-d15}
- sub r0, #4
- vld1.8 {d16}, [r0], r1
- vld1.8 {d18}, [r0], r1
- vld1.8 {d20}, [r0], r1
- vld1.8 {d22}, [r0], r1
- vld1.8 {d24}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d28}, [r0], r1
- vld1.8 {d30}, [r0], r1
- sub r0, r0, r1, lsl #3
- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
- hevc_loop_filter_luma_body
- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
- vst1.8 {d16}, [r0], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d22}, [r0], r1
- vst1.8 {d24}, [r0], r1
- vst1.8 {d26}, [r0], r1
- vst1.8 {d28}, [r0], r1
- vst1.8 {d30}, [r0]
+
+ @ Uses slightly fewer instructions to do laned loads than unlaned
+ @ and transpose. This also means that we can use the same code for
+ @ both split & unsplit deblock
+ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
+ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
+
+ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+
+ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+
+ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+
+ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+
+ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+
+ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+
+ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
+ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
+
+ bl hevc_loop_filter_luma_body
+
+ neg r1, r1
+
+ @ no_p[1]
+ tst r10, #0xff00
+ add r2, r4, r1, lsl #2
+ bne 1f
+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32]
+1:
+ @ no_p[0]
+ tst r10, #0xff
+ bne 1f
+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1
+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1
+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1
+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32]
+1:
+ @ no_q[1]
+ tst r10, #0xff000000
+ add r2, r0, r1, lsl #2
+ bne 1f
+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32]
+1:
+ @ no_q[0]
+ tst r10, #0xff0000
+ bne 1f
+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1
+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1
+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
+1:
+bypasswrite:
vpop {d8-d15}
- pop {r5-r11}
- bx lr
+ pop {r4-r10,pc}
endfunc
+.macro m_filter_v_luma_common_16 bit_depth
+ vpush {d8-d15}
+
+ @ Uses slightly fewer instructions to do laned loads than unlaned
+ @ and transpose. This also means that we can use the same code for
+ @ both split & unsplit deblock
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4]
+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
+ neg r1, r1
+
+ @ p[1]
+ tst r10, #0xff00
+ add r2, r4, r1, lsl #2
+ bne 1f
+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4]
+1:
+ @ p[0]
+ tst r10, #0xff
+ bne 1f
+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1
+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1
+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1
+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2]
+1:
+ @ q[1]
+ tst r10, #0xff000000
+ add r2, r0, r1, lsl #2
+ bne 1f
+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0]
+1:
+ @ q[0]
+ tst r10, #0xff0000
+ bne 1f
+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1
+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1
+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
+1:
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+.endm
+
+
+
+
+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
+@ ptrdiff_t stride, [r1]
+@ int beta, [r2]
+@ int32_t *tc, [r3]
+@ uint8_t *no_p, sp[0]
+@ uint8_t *no_q); sp[4]
+@
+@ Src should always be on an 8 byte boundary & all in the same slice
+
function ff_hevc_h_loop_filter_luma_neon, export=1
hevc_loop_filter_luma_start
- push {r5-r11}
+ push {r4-r10,lr}
+
vpush {d8-d15}
sub r0, r0, r1, lsl #2
+
vld1.8 {d16}, [r0], r1
+ vld1.8 {d17}, [r0], r1
vld1.8 {d18}, [r0], r1
+ vld1.8 {d19}, [r0], r1
vld1.8 {d20}, [r0], r1
+ vld1.8 {d21}, [r0], r1
vld1.8 {d22}, [r0], r1
- vld1.8 {d24}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d28}, [r0], r1
- vld1.8 {d30}, [r0], r1
- sub r0, r0, r1, lsl #3
- add r0, r1
- hevc_loop_filter_luma_body
- vst1.8 {d18}, [r0], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d22}, [r0], r1
- vst1.8 {d24}, [r0], r1
- vst1.8 {d26}, [r0], r1
- vst1.8 {d28}, [r0]
-bypasswrite:
+ vld1.8 {d23}, [r0]
+
+ bl hevc_loop_filter_luma_body
+
vpop {d8-d15}
- pop {r5-r11}
- bx lr
+
+ neg r1, r1
+ add r0, r0, r1
+
+ bne 1f
+
+ vst1.8 {d22}, [r0], r1
+ vst1.8 {d21}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d19}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d17}, [r0]
+
+ pop {r4-r10,pc}
+
+@ Partial write
+1:
+ vmov r2, r3, d22
+ vmov r4, r5, d21
+ vmov r6, r7, d20
+
+ tst r10, #0xff0000
+ ittt eq
+ streq r2, [r0]
+ streq r4, [r0, r1]
+ streq r6, [r0, r1, lsl # 1]
+
+ add r0, r0, #4
+ tst r10, #0xff000000
+ ittt eq
+ streq r3, [r0]
+ streq r5, [r0, r1]
+ streq r7, [r0, r1, lsl # 1]
+
+ vmov r2, r3, d19
+ vmov r4, r5, d18
+ vmov r6, r7, d17
+ add r0, r0, r1
+ add r0, r0, r1, lsl # 1
+
+ tst r10, #0xff00
+ ittt eq
+ streq r3, [r0]
+ streq r5, [r0, r1]
+ streq r7, [r0, r1, lsl # 1]
+
+ tst r10, #0xff
+ ittt eq
+ streq r2, [r0, #-4]!
+ streq r4, [r0, r1]
+ streq r6, [r0, r1, lsl # 1]
+
+ pop {r4-r10,pc}
+
+endfunc
+
+
+.macro m_filter_h_luma_16 bit_depth
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr}
+
+ vpush {d8-d15}
+ sub r0, r0, r1, lsl #2
+
+ vld1.16 { q8}, [r0], r1
+ vld1.16 { q9}, [r0], r1
+ vld1.16 {q10}, [r0], r1
+ vld1.16 {q11}, [r0], r1
+ vld1.16 {q12}, [r0], r1
+ vld1.16 {q13}, [r0], r1
+ vld1.16 {q14}, [r0], r1
+ vld1.16 {q15}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
+ vpop {d8-d15}
+
+ sub r0, r1
+ neg r1, r1
+ bne 1f
+
+ vst1.16 {q14}, [r0], r1
+ vst1.16 {q13}, [r0], r1
+ vst1.16 {q12}, [r0], r1
+ vst1.16 {q11}, [r0], r1
+ vst1.16 {q10}, [r0], r1
+ vst1.16 { q9}, [r0]
+ pop {r4-r10,pc}
+
+@ Partial write
+1:
+ tst r10, #0xff0000
+ mov r2, r0
+ bne 1f
+ vst1.16 {d28}, [r2], r1
+ vst1.16 {d26}, [r2], r1
+ vst1.16 {d24}, [r2]
+
+1:
+ tst r10, #0xff000000
+ add r2, r0, #8
+ bne 1f
+ vst1.16 {d29}, [r2], r1
+ vst1.16 {d27}, [r2], r1
+ vst1.16 {d25}, [r2]
+
+1:
+ tst r10, #0xff
+ @ r0 = r0 + r1 * 3
+ add r0, r0, r1
+ add r0, r0, r1, lsl # 1
+ add r2, r0, #8
+ bne 1f
+ vst1.16 {d22}, [r0], r1
+ vst1.16 {d20}, [r0], r1
+ vst1.16 {d18}, [r0]
+
+1:
+ tst r10, #0xff00
+ bne 1f
+ vst1.16 {d23}, [r2], r1
+ vst1.16 {d21}, [r2], r1
+ vst1.16 {d19}, [r2]
+
+1:
+ pop {r4-r10,pc}
+.endm
+
+
+@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ unsigned int no_f); // r3
+@
+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+function ff_hevc_h_loop_filter_uv_neon_8, export=1
+ sub r0, r0, r1, lsl #1
+ vld2.8 {d16,d17}, [r0], r1
+ vld2.8 {d18,d19}, [r0], r1
+ vld2.8 {d26,d27}, [r0], r1
+ vld2.8 {d28,d29}, [r0]
+ sub r0, r0, r1, lsl #1
+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29
+ cmp r3, #0
+ bne 1f
+ vst2.8 {d18,d19}, [r0], r1
+ vst2.8 {d26,d27}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
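+ @ (lsls copies the relevant no_f bits into the N and C flags, so each
+ @  half-row store below can be skipped with bmi/bcs instead of extra tst/beq)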
+1: vzip.8 d18, d19
+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
+ vzip.8 d26, d27
+ sub r1, r1, #8
+
+ bmi 1f
+ vst1.8 {d18}, [r0]
+1: add r0, r0, #8
+ bcs 2f
+ vst1.8 {d19}, [r0]
+2: lsls r2, r3, #29 @ b2 -> N, b3 -> C
+ add r0, r0, r1
+
+ bmi 1f
+ vst1.8 {d26}, [r0]
+1: it cs
+ bxcs lr
+ add r0, r0, #8
+ vst1.8 {d27}, [r0]
+ bx lr
+
+endfunc
+
+
+@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ unsigned int no_f); // r3
+@
+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+@
+@ Macro here; actual function near bottom
+
+.macro m_filter_h_uv_16 bit_depth
+ sub r0, r0, r1, lsl #1
+ vld2.16 {q8, q9 }, [r0], r1
+ vld2.16 {q10, q11}, [r0], r1
+ vld2.16 {q12, q13}, [r0], r1
+ vld2.16 {q14, q15}, [r0]
+ sub r0, r0, r1, lsl #1
+
+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
+
+ cmp r3, #0
+ bne 1f
+ vst2.16 {q10, q11}, [r0], r1
+ vst2.16 {q12, q13}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
+1: vzip.16 q10, q11
+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
+ vzip.16 q12, q13
+ sub r1, r1, #16
+
+ bmi 1f
+ vst1.16 {q10}, [r0]
+1: add r0, r0, #16
+ bcs 2f
+ vst1.16 {q11}, [r0]
+2: lsls r2, r3, #29 @ b2 -> N, b3 -> C
+ add r0, r0, r1
+
+ bmi 1f
+ vst1.16 {q12}, [r0]
+1: it cs
+ bxcs lr
+ add r0, r0, #16
+ vst1.16 {q13}, [r0]
+ bx lr
+.endm
+
+
+@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ uint8_t * src_l, // r3
+@ unsigned int no_f); // sp[0]
+@
+@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+
+function ff_hevc_v_loop_filter_uv2_neon_8, export=1
+ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1
+ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1
+ sub r12, r0, r3
+
+ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1
+ cmp r12, #4
+
+ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1
+
+ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1
+
+ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1
+
+ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1
+
+ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1
+
+ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3]
+ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0]
+ it eq
+ ldreq r12, [sp, #0]
+
+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23
+ cmp r12, #0
+ add r3, #2
+ neg r1, r1
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1
+ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1
+ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1
+ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1
+ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1
+ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1
+ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1
+ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3]
+ bx lr
+
+@ Either split or partial
+1:
+ ldr r12, [sp, #0]
+ lsls r12, #29 @ b2 -> N, b3 -> C
+ add r2, r0, r1, lsl #2
+ bcs 1f
+ vst2.8 {d20[7], d21[7]}, [r0], r1
+ vst2.8 {d20[6], d21[6]}, [r0], r1
+ vst2.8 {d20[5], d21[5]}, [r0], r1
+ vst2.8 {d20[4], d21[4]}, [r0]
+1:
+ bmi 2f
+ vst2.8 {d20[3], d21[3]}, [r2], r1
+ vst2.8 {d20[2], d21[2]}, [r2], r1
+ vst2.8 {d20[1], d21[1]}, [r2], r1
+ vst2.8 {d20[0], d21[0]}, [r2]
+
+2:
+ lsls r12, #2
+ add r2, r3, r1, lsl #2
+ bcs 3f
+ vst2.8 {d18[7], d19[7]}, [r3], r1
+ vst2.8 {d18[6], d19[6]}, [r3], r1
+ vst2.8 {d18[5], d19[5]}, [r3], r1
+ vst2.8 {d18[4], d19[4]}, [r3]
+3:
+ it mi
+ bxmi lr
+ vst2.8 {d18[3], d19[3]}, [r2], r1
+ vst2.8 {d18[2], d19[2]}, [r2], r1
+ vst2.8 {d18[1], d19[1]}, [r2], r1
+ vst2.8 {d18[0], d19[0]}, [r2]
+ bx lr
endfunc
+
+@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ uint8_t * src_l, // r3
+@ unsigned int no_f); // sp[0]
+@
+@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+.macro m_filter_v_uv2_16 bit_depth
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1
+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+ sub r12, r0, r3
+
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1
+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+ cmp r12, #8
+
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1
+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1
+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1
+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1
+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1
+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3]
+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+ it eq
+ ldreq r12, [sp, #0]
+
+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
+ cmp r12, #0
+ add r3, #4
+ neg r1, r1
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1
+ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1
+ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1
+ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1
+ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1
+ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1
+ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1
+ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1
+ bx lr
+
+@ Either split or partial
+1:
+ ldr r12, [sp, #0]
+ lsls r12, #29 @ b2 -> N, b3 -> C
+ add r2, r0, r1, lsl #2
+ bcs 1f
+ vst2.16 {d25[3], d27[3]}, [r0], r1
+ vst2.16 {d25[2], d27[2]}, [r0], r1
+ vst2.16 {d25[1], d27[1]}, [r0], r1
+ vst2.16 {d25[0], d27[0]}, [r0]
+1:
+ bmi 2f
+ vst2.16 {d24[3], d26[3]}, [r2], r1
+ vst2.16 {d24[2], d26[2]}, [r2], r1
+ vst2.16 {d24[1], d26[1]}, [r2], r1
+ vst2.16 {d24[0], d26[0]}, [r2]
+
+2:
+ lsls r12, #2
+ add r2, r3, r1, lsl #2
+ bcs 3f
+ vst2.16 {d21[3], d23[3]}, [r3], r1
+ vst2.16 {d21[2], d23[2]}, [r3], r1
+ vst2.16 {d21[1], d23[1]}, [r3], r1
+ vst2.16 {d21[0], d23[0]}, [r3]
+3:
+ it mi
+ bxmi lr
+ vst2.16 {d20[3], d22[3]}, [r2], r1
+ vst2.16 {d20[2], d22[2]}, [r2], r1
+ vst2.16 {d20[1], d22[1]}, [r2], r1
+ vst2.16 {d20[0], d22[0]}, [r2]
+ bx lr
+.endm
+
+
+
function ff_hevc_v_loop_filter_chroma_neon, export=1
hevc_loop_filter_chroma_start
+
+ sub r0, #2
+ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1
+ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1
+ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1
+ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1
+ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1
+ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1
+ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1
+ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1
+
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #1
+ hevc_loop_filter_chroma_body d16, d17, d18, d19
+ bne 1f
+
+ vst2.8 {d17[0], d18[0]}, [r0], r1
+ vst2.8 {d17[1], d18[1]}, [r0], r1
+ vst2.8 {d17[2], d18[2]}, [r0], r1
+ vst2.8 {d17[3], d18[3]}, [r0], r1
+ vst2.8 {d17[4], d18[4]}, [r0], r1
+ vst2.8 {d17[5], d18[5]}, [r0], r1
+ vst2.8 {d17[6], d18[6]}, [r0], r1
+ vst2.8 {d17[7], d18[7]}, [r0], r1
+ bx lr
+
+1:
+ tst r12, #0xff @ P0a
+ bne 2f
+
+ vst1.8 {d17[0]}, [r0], r1
+ vst1.8 {d17[1]}, [r0], r1
+ vst1.8 {d17[2]}, [r0], r1
+ vst1.8 {d17[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+2:
+ tst r12, #0xff0000 @ Q0a
+ add r0, #1
+ bne 3f
+ vst1.8 {d18[0]}, [r0], r1
+ vst1.8 {d18[1]}, [r0], r1
+ vst1.8 {d18[2]}, [r0], r1
+ vst1.8 {d18[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+3:
+ tst r12, #0xff000000 @ Q0b
+ add r0, r0, r1, lsl #2
+ bne 4f
+ vst1.8 {d18[4]}, [r0], r1
+ vst1.8 {d18[5]}, [r0], r1
+ vst1.8 {d18[6]}, [r0], r1
+ vst1.8 {d18[7]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+4:
+ tst r12, #0xff00 @ P0b
+ it ne
+ bxne lr
+
+ sub r0, #1
+ vst1.8 {d17[4]}, [r0], r1
+ vst1.8 {d17[5]}, [r0], r1
+ vst1.8 {d17[6]}, [r0], r1
+ vst1.8 {d17[7]}, [r0], r1
+ bx lr
+
+endfunc
+
+
+.macro m_filter_v_chroma_16 bit_depth
+ hevc_loop_filter_chroma_start
+
sub r0, #4
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1
+
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #2
+ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth
+ bne 1f
+
+ vst2.16 {d18[0], d20[0]}, [r0], r1
+ vst2.16 {d18[1], d20[1]}, [r0], r1
+ vst2.16 {d18[2], d20[2]}, [r0], r1
+ vst2.16 {d18[3], d20[3]}, [r0], r1
+ vst2.16 {d19[0], d21[0]}, [r0], r1
+ vst2.16 {d19[1], d21[1]}, [r0], r1
+ vst2.16 {d19[2], d21[2]}, [r0], r1
+ vst2.16 {d19[3], d21[3]}, [r0], r1
+ bx lr
+
+1:
+ tst r12, #0xff @ P0a
+ bne 2f
+
+ vst1.16 {d18[0]}, [r0], r1
+ vst1.16 {d18[1]}, [r0], r1
+ vst1.16 {d18[2]}, [r0], r1
+ vst1.16 {d18[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+2:
+ tst r12, #0xff0000 @ Q0a
+        add      r0, #2        @ 16-bit pixels: Q0 column is 2 bytes right of P0
+ bne 3f
+ vst1.16 {d20[0]}, [r0], r1
+ vst1.16 {d20[1]}, [r0], r1
+ vst1.16 {d20[2]}, [r0], r1
+ vst1.16 {d20[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+3:
+ tst r12, #0xff000000 @ Q0b
+ add r0, r0, r1, lsl #2
+ bne 4f
+ vst1.16 {d21[0]}, [r0], r1
+ vst1.16 {d21[1]}, [r0], r1
+ vst1.16 {d21[2]}, [r0], r1
+ vst1.16 {d21[3]}, [r0], r1
+ sub r0, r0, r1, lsl #2
+
+4:
+ tst r12, #0xff00 @ P0b
+ it ne
+ bxne lr
+
+        sub      r0, #2        @ back one 16-bit pixel from Q0 to the P0 column
+ vst1.16 {d19[0]}, [r0], r1
+ vst1.16 {d19[1]}, [r0], r1
+ vst1.16 {d19[2]}, [r0], r1
+ vst1.16 {d19[3]}, [r0], r1
+ bx lr
+.endm
+
+
+@ void ff_hevc_h_loop_filter_chroma_neon(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int *_tc, [r2]
+@ uint8_t *_no_p, [r3]
+@ uint8_t *_no_q); [sp+0]
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+ hevc_loop_filter_chroma_start
+ sub r0, r0, r1, lsl #1
vld1.8 {d16}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d18}, [r0], r1
- vld1.8 {d2}, [r0], r1
- vld1.8 {d4}, [r0], r1
- vld1.8 {d19}, [r0], r1
- vld1.8 {d20}, [r0], r1
- vld1.8 {d21}, [r0], r1
- sub r0, r0, r1, lsl #3
- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
- hevc_loop_filter_chroma_body
- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
- vst1.8 {d16}, [r0], r1
+ vld1.8 {d19}, [r0]
+ sub r0, r0, r1, lsl #1
+ hevc_loop_filter_chroma_body d16, d17, d18, d19
+ bne 1f @ Partial write
vst1.8 {d17}, [r0], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d2}, [r0], r1
- vst1.8 {d4}, [r0], r1
- vst1.8 {d19}, [r0], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d21}, [r0]
+ vst1.8 {d18}, [r0]
+ bx lr
+1:
+ tst r12, #0xff
+ vmov r2, r3, d17
+ it eq
+ streq r2, [r0]
+ tst r12, #0xff00
+ it eq
+ streq r3, [r0, #4]
+
+ add r0, r1
+ tst r12, #0xff0000
+ vmov r2, r3, d18
+ it eq
+ streq r2, [r0]
+ tst r12, #0xff000000
+ it eq
+ streq r3, [r0, #4]
+
bx lr
endfunc
-function ff_hevc_h_loop_filter_chroma_neon, export=1
+.macro m_filter_h_chroma_16 bit_depth
hevc_loop_filter_chroma_start
sub r0, r0, r1, lsl #1
- vld1.8 {d18}, [r0], r1
- vld1.8 {d2}, [r0], r1
- vld1.8 {d4}, [r0], r1
- vld1.8 {d19}, [r0]
+ vld1.16 {q8}, [r0], r1
+ vld1.16 {q9}, [r0], r1
+ vld1.16 {q10}, [r0], r1
+ vld1.16 {q11}, [r0]
sub r0, r0, r1, lsl #1
- hevc_loop_filter_chroma_body
- vst1.8 {d2}, [r0], r1
- vst1.8 {d4}, [r0]
+ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth
+ bne 1f @ Partial write
+ vst1.16 {q9}, [r0], r1
+ vst1.16 {q10}, [r0]
+ bx lr
+1:
+ tst r12, #0xff
+ bne 2f
+ vst1.16 {d18}, [r0]
+2:
+ tst r12, #0xff00
+ bne 3f
+ add r0, #8
+ vst1.16 {d19}, [r0]
+ sub r0, #8
+3:
+ tst r12, #0xff0000
+ add r0, r1
+ bne 4f
+ vst1.16 {d20}, [r0]
+4:
+ tst r12, #0xff000000
+ it ne
+ bxne lr
+ add r0, #8
+ vst1.16 {d21}, [r0]
+
bx lr
+.endm
+
+
+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+ */
+function ff_hevc_deblocking_boundary_strengths_neon, export=1
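+        @ Register aliases: a1-a4 = r0-r3, v1-v8 = r4-r11 (ARM APCS names)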
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
+ ldmia ip, {v5-v7}
+1: ldmdb ip, {v1-v4}
+ ldrsb a3, [v5, #8] @ curr->ref_idx
+ ldrsb v8, [v5, #9]
+ ldrsb ip, [v6, #8] @ neigh->ref_idx
+ ldrsb lr, [v6, #9]
+ ldr v1, [v1, a3, lsl #2]
+ ldrb a3, [v5, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
+ ldrb v8, [v6, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
+ teq a3, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
+ tst a3, #1
+ itee ne
+ ldrne a3, [v5, #0] @ curr->mv[0]
+ ldreq a3, [v5, #4] @ curr->mv[1]
+ moveq v1, v2
+ tst v8, #1
+ itee ne
+ ldrne v8, [v6, #0] @ neigh->mv[0]
+ ldreq v8, [v6, #4] @ neigh->mv[1]
+ moveq v3, v4
+ teq v1, v3
+ bne 10f
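+        @ 0xFFFCFFFC clears the two low bits of each packed 16-bit MV component,
+        @ so the ands below is non-zero iff the MV delta is >= 4 quarter-pel
+        @ (at least one integer pel) in either x or y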
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v8, a3
+ ssub16 a3, a3, v8
+ sel a3, a3, ip
+ ands a3, a3, lr
+ @ drop through
+10: it ne
+ movne a3, #1
+11: subs a2, a2, #1
+12:
+A strbhs a3, [v7], a4
+T itt hs
+T strbhs a3, [v7]
+T addhs v7, v7, a4
+ subs a2, a2, #1
+ bhs 12b
+
+ ldm sp, {a2, a3}
+ add ip, sp, #16*4
+ subs a1, a1, #1
+ add v5, v5, a3
+ add v6, v6, a3
+ bhi 1b
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
+ bne 10b
+
+ teq v1, v3
+ it eq
+ teqeq v2, v4
+ bne 40f
+ teq v1, v2
+ bne 30f
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 25f
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
+ ssub16 a3, v1, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v3, v2
+ ssub16 a3, v2, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+30: ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+40: teq v1, v4
+ ite eq
+ teqeq v2, v3
+ bne 10b
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
+90: mov a3, #1
+ b 11b
+endfunc
+
+@ =============================================================================
+@
+@ 10 bit
+
+function hevc_loop_filter_luma_body_10
+ m_filter_luma 10
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon_10, export=1
+ m_filter_h_luma_16 10
+endfunc
+
+function ff_hevc_v_loop_filter_luma2_neon_10, export=1
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr} @ 8 regs = 32 bytes
+
+ ldr r4, [sp, #40]
+ b v_loop_luma_common_10
+endfunc
+
+function ff_hevc_v_loop_filter_luma_neon_10, export=1
+ hevc_loop_filter_luma_start
+ push {r4-r10,lr}
+
+ sub r4, r0, #8
+v_loop_luma_common_10:
+ m_filter_v_luma_common_16 10
+endfunc
+
+function ff_hevc_h_loop_filter_uv_neon_10, export=1
+ m_filter_h_uv_16 10
+endfunc
+
+function ff_hevc_v_loop_filter_uv2_neon_10, export=1
+ m_filter_v_uv2_16 10
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon_10, export=1
+ m_filter_h_chroma_16 10
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon_10, export=1
+ m_filter_v_chroma_16 10
endfunc
+
diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..00eab9eeee
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_epel_neon.S
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro vextin_d4
+ vld1.8 {q10}, [r1], r2
+ vmov d16, d20
+ vext.8 d17, d20, d21, #1
+ vext.8 d18, d20, d21, #2
+ vext.8 d19, d20, d21, #3
+.endm
+
+.macro vextin_d4_8
+ vld1.8 d16, [r1], r2
+ vext.8 d17, d16, d16, #1
+ vext.8 d18, d16, d16, #2
+ vext.8 d19, d16, d16, #3
+.endm
+
+.macro load_coeffs_16b coeffs
+ ldr \coeffs, [\coeffs]
+ vdup.i8 d0, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d1, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d2, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d3, \coeffs
+.endm
+
+.macro epel_filter_16b out=q12
+ vmull.u8 q3, d16, d0
+ vmull.u8 q11, d19, d3
+ vmull.u8 \out, d17, d1
+ vmull.u8 q10, d18, d2
+ vadd.s16 q3, q11
+ vadd.s16 \out, q10
+ vsub.s16 \out, q3
+.endm
+
+.macro load_coeffs_32b coeffs
+ ldr \coeffs, [\coeffs]
+ vmov.i64 d4, #0
+ vmov.8 d4[0], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[2], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[4], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[6], \coeffs
+.endm
+
+.macro epel_filter_32b
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q4, d25, d4[0]
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q6, d31, d4[3]
+
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q8, d27, d4[1]
+ vmull.s16 q9, d28, d4[2] // q14
+ vmull.s16 q10, d29, d4[2]
+ vadd.s32 q3, q5
+ vadd.s32 q4, q6
+ vadd.s32 q7, q9
+ vadd.s32 q8, q10
+ vsub.s32 q7, q3
+ vsub.s32 q8, q4
+ vqshrn.s32 d6, q7, #6
+ vqshrn.s32 d7, q8, #6
+.endm
+
+.macro epel_filter_32b_4
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q9, d28, d4[2] // q14
+ vadd.s32 q3, q5
+ vadd.s32 q7, q9
+ vsub.s32 q7, q3
+ vqshrn.s32 d6, q7, #6
+.endm
+
+function ff_hevc_put_epel_h_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #16] // mx
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+@ Plain adr reaches epel_coeffs when assembled as Thumb but not as ARM, so ARM uses adrl
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ sub r1, #1
+ lsl r4, #1
+ load_coeffs_16b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: subs r3, #1
+ pld [r1]
+ vextin_d4
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ cmp r5, #4
+ bgt 8b
+4: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+2: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_v_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ load_coeffs_16b r7
+ sub r1, r2
+ lsl r4, #1
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vld1.8 {d16}, [r1], r2
+ pld [r1]
+ vld1.8 {d17}, [r1], r2
+ pld [r1]
+ vld1.8 {d18}, [r1], r2
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_hv_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r6, [sp, #16] // mx
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+ adr r12, epel_coeffs
+ sub r6, #1
+ lsl r6, #2
+ add r6, r12 // mx epel coeff offset
+ add r7, r12
+ sub r1, #1
+ sub r1, r2
+ lsl r4, #1
+ load_coeffs_16b r6
+ load_coeffs_32b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vextin_d4
+ epel_filter_16b q12
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q13
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q14
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vextin_d4
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b
+ vst1.16 {q3}, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.16 d6, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.32 d6[0], [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
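+@ HEVC 4-tap chroma (epel) interpolation filter coefficient magnitudes for
+@ fractional positions 1..7.  The outer taps are negative for every position,
+@ so epel_filter_16b / epel_filter_32b subtract them rather than storing signs.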
+epel_coeffs:
+ .byte 2, 58, 10, 2
+ .byte 4, 54, 16, 2
+ .byte 6, 46, 28, 4
+ .byte 4, 36, 36, 4
+ .byte 4, 28, 46, 6
+ .byte 2, 16, 54, 4
+ .byte 2, 10, 58, 2
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
index 13d540e5ff..9b6d745556 100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -21,82 +21,6 @@
#include "libavutil/arm/asm.S"
#include "neon.S"
-function ff_hevc_idct_4x4_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- vdup.16 q0, r1
- vdup.16 q1, r1
- vst1.16 {q0, q1}, [r0]
- bx lr
-endfunc
-
-function ff_hevc_idct_8x8_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- vdup.16 q8, r1
- vdup.16 q9, r1
- vmov.16 q10, q8
- vmov.16 q11, q8
- vmov.16 q12, q8
- vmov.16 q13, q8
- vmov.16 q14, q8
- vmov.16 q15, q8
- vstm r0, {q8-q15}
- bx lr
-endfunc
-
-function ff_hevc_idct_16x16_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- vdup.16 q8, r1
- vdup.16 q9, r1
- vmov.16 q10, q8
- vmov.16 q11, q8
- vmov.16 q12, q8
- vmov.16 q13, q8
- vmov.16 q14, q8
- vmov.16 q15, q8
- vstm r0!, {q8-q15}
- vstm r0!, {q8-q15}
- vstm r0!, {q8-q15}
- vstm r0, {q8-q15}
- bx lr
-endfunc
-
-function ff_hevc_idct_32x32_dc_neon_8, export=1
- ldrsh r1, [r0]
- ldr r2, =0x20
- add r1, #1
- asr r1, #1
- add r1, r2
- asr r1, #6
- mov r3, #16
- vdup.16 q8, r1
- vdup.16 q9, r1
- vmov.16 q10, q8
- vmov.16 q11, q8
- vmov.16 q12, q8
- vmov.16 q13, q8
- vmov.16 q14, q8
- vmov.16 q15, q8
-1: subs r3, #1
- vstm r0!, {q8-q15}
- bne 1b
- bx lr
-endfunc
-
function ff_hevc_transform_add_4x4_neon_8, export=1
vldm r1, {q0-q1}
vld1.32 d4[0], [r0], r2
@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1
bx lr
endfunc
+
+@ ff_hevc_add_residual_4x4_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_4x4_dc_neon_8, export=1
+ vdup.16 q15, r2
+
+ vld1.32 d4[0], [r0], r1
+ vld1.32 d4[1], [r0], r1
+ vld1.32 d5[0], [r0], r1
+ vld1.32 d5[1], [r0], r1
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q0, q15, d4
+ vaddw.u8 q1, q15, d5
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d0[1], [r0], r1
+ vst1.32 d1[0], [r0], r1
+ vst1.32 d1[1], [r0], r1
+ bx lr
+endfunc
+
+
+@ ff_hevc_add_residual_4x4_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1
+ vdup.32 q15, r2
+ mov r3, #4
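+        @ 4x4 chroma = 4 rows of 4 interleaved U/V pairs = 8 bytes per row,
+        @ so reuse the row loop (label 1) in the 8x8 dc function below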
+ b 1f
+endfunc
+
+@ ff_hevc_add_residual_8x8_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_8x8_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #8
+
+1: subs r3, #1
+ vld1.8 d16, [r0]
+ vaddw.u8 q0, q15, d16
+ vqmovun.s16 d0, q0
+ vst1.32 d0, [r0], r1
+ bne 1b
+ bx lr
+endfunc
+
+
+@ ff_hevc_add_residual_8x8_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1
+ vdup.32 q15, r2
+ mov r3, #8
+ b 1f
+endfunc
+
+@ ff_hevc_add_residual_16x16_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_16x16_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #16
+
+1: subs r3, #1
+ vld1.8 {q8}, [r0]
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.8 {q0}, [r0], r1
+ bne 1b
+ bx lr
+endfunc
+
+
+@ ff_hevc_add_residual_16x16_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1
+ vdup.32 q15, r2
+ mov r3, #16
+ b 1f
+endfunc
+
+@ ff_hevc_add_residual_32x32_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_add_residual_32x32_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #32
+
+1: subs r3, #1
+ vld1.8 {q8, q9}, [r0]
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vaddw.u8 q2, q15, d18
+ vaddw.u8 q3, q15, d19
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst1.8 {q0, q1}, [r0], r1
+ bne 1b
+ bx lr
+endfunc
+
+
+
.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.64 \r0, \r4
vtrn.64 \r1, \r5
@@ -263,55 +312,6 @@ endfunc
vqrshrn.s32 \r3, q3, \shift
.endm
-function ff_hevc_transform_4x4_neon_8, export=1
- vpush {d8-d15}
- vld1.16 {q14, q15}, [r0] // coeffs
- ldr r3, =0x00240053 // 36 and 83
- vmov.32 d0[0], r3
-
- tr4_shift d28, d29, d30, d31, #7
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
-
- tr4_shift d28, d29, d30, d31, #12
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
-
- vst1.16 {q14, q15}, [r0]
- vpop {d8-d15}
- bx lr
-endfunc
-
-function ff_hevc_transform_luma_4x4_neon_8, export=1
- vpush {d8-d15}
- vld1.16 {q14, q15}, [r0] // coeffs
- ldr r3, =0x4a // 74
- vmov.32 d0[0], r3
- ldr r3, =0x1d // 29
- vmov.32 d0[1], r3
- ldr r3, =0x37 // 55
- vmov.32 d1[0], r3
-
- tr4_luma_shift d28, d29, d30, d31, #7
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
-
- tr4_luma_shift d28, d29, d30, d31, #12
-
- vtrn.16 d28, d29
- vtrn.16 d30, d31
- vtrn.32 q14, q15
- vst1.16 {q14, q15}, [r0]
- vpop {d8-d15}
- bx lr
-endfunc
-
.macro tr8_begin in0, in1, in2, in3
vmull.s16 q7, \in0, d1[1] // 89 * src1
vmull.s16 q8, \in0, d1[0] // 75 * src1
@@ -356,100 +356,6 @@ endfunc
vqrshrn.s32 d8, q5, \shift
.endm
-function ff_hevc_transform_8x8_neon_8, export=1
- push {r4-r8}
- vpush {d8-d15}
- mov r5, #16
-
- adr r3, tr4f
- vld1.16 {d0, d1}, [r3]
-
- // left half
- vld1.16 {d24}, [r0], r5
- vld1.16 {d25}, [r0], r5
- vld1.16 {d26}, [r0], r5
- vld1.16 {d27}, [r0], r5
- vld1.16 {d28}, [r0], r5
- vld1.16 {d29}, [r0], r5
- vld1.16 {d30}, [r0], r5
- vld1.16 {d31}, [r0], r5
- sub r0, #128
- tr8_begin d25, d27, d29, d31
- tr4 d24, d26, d28, d30
- tr8_end #7
- vst1.16 {d2}, [r0], r5
- vst1.16 {d3}, [r0], r5
- vst1.16 {d4}, [r0], r5
- vst1.16 {d5}, [r0], r5
- vst1.16 {d6}, [r0], r5
- vst1.16 {d7}, [r0], r5
- vst1.16 {d8}, [r0], r5
- vst1.16 {d9}, [r0], r5
- sub r0, #128
- //skip right half if col_limit in r1 is less than 4
- cmp r1, #4
- blt 1f
- //right half
- add r0, #8
- vld1.16 {d24}, [r0], r5
- vld1.16 {d25}, [r0], r5
- vld1.16 {d26}, [r0], r5
- vld1.16 {d27}, [r0], r5
- vld1.16 {d28}, [r0], r5
- vld1.16 {d29}, [r0], r5
- vld1.16 {d30}, [r0], r5
- vld1.16 {d31}, [r0], r5
- sub r0, #128
- tr8_begin d25, d27, d29, d31
- tr4 d24, d26, d28, d30
- tr8_end #7
- vst1.16 {d2}, [r0], r5
- vst1.16 {d3}, [r0], r5
- vst1.16 {d4}, [r0], r5
- vst1.16 {d5}, [r0], r5
- vst1.16 {d6}, [r0], r5
- vst1.16 {d7}, [r0], r5
- vst1.16 {d8}, [r0], r5
- vst1.16 {d9}, [r0], r5
- sub r0, #136
-1:
- // top half
- vldm r0, {q12-q15} // coeffs
- transpose_16b_4x4 d24, d26, d28, d30
- transpose_16b_4x4 d25, d27, d29, d31
- tr8_begin d26, d30, d27, d31
- tr4 d24, d28, d25, d29
- tr8_end #12
- transpose_16b_4x4 d2, d3, d4, d5
- transpose_16b_4x4 d6, d7, d8, d9
- vswp d7, d5
- vswp d7, d8
- vswp d3, d6
- vswp d6, d4
- vstm r0!, {q1-q4}
-
- // bottom half
- vldm r0, {q12-q15} // coeffs
- transpose_16b_4x4 d24, d26, d28, d30
- transpose_16b_4x4 d25, d27, d29, d31
- tr8_begin d26, d30, d27, d31
- tr4 d24, d28, d25, d29
- tr8_end #12
- transpose_16b_4x4 d2, d3, d4, d5
- transpose_16b_4x4 d6, d7, d8, d9
- vswp d7, d5
- vswp d7, d8
- vswp d3, d6
- vswp d6, d4
- //vstm r0, {q1-q4}
- vst1.16 {q1-q2}, [r0]
- add r0, #32
- vst1.16 {q3-q4}, [r0]
- sub r0, #32
- vpop {d8-d15}
- pop {r4-r8}
- bx lr
-endfunc
.align 4
tr4f:
@@ -463,3 +369,11 @@ tr16:
.word 0x00500046 // 80, d2[2] = 70
.word 0x0039002b // 57, d2[0] = 43
.word 0x00190009 // 25, d2[2] = 9
+
+#define BIT_DEPTH 8
+#include "hevc_idct_fn_neon.S"
+
+#undef BIT_DEPTH
+#define BIT_DEPTH 10
+#include "hevc_idct_fn_neon.S"
+
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 55918077e2..e708b7c074 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -22,11 +22,41 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/bit_depth_template.c"
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+#ifdef RPI
+void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+ const uint8_t no_p[2], const uint8_t no_q[2],
+ uint8_t * _pix_l);
+void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+ const uint8_t no_p[2], const uint8_t no_q[2],
+ uint8_t * _pix_l);
+void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+void ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+#endif
+
void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+
+void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs);
+
void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+ ptrdiff_t stride);
+
+void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+
+
+void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+
+
+#if RPI_HEVC_SAND
+void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+
+
+void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+#endif
+
+void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+
+void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+
+#if RPI_HEVC_SAND
+void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+
+void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+
+void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+
+void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+#endif
+
+void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
#undef PUT_PIXELS
+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, int width);
@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
}
+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
+
+
+static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
+}
+static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
+}
+
+static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+
+#if SAO_FILTER_N == 6
+static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
+}
+static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
+}
+
+static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+
+#if RPI_HEVC_SAND
+static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height)
+{
+ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height)
+{
+ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+
+static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+#endif
+#endif
+
+
+
+#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160
+#error SAO edge src stride not 160 - value used in .S
+#endif
+
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
if (bit_depth == 8) {
int x;
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
+ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon;
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon;
+#ifdef RPI
+ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8;
+ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8;
+ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8;
+#endif
c->idct[0] = ff_hevc_transform_4x4_neon_8;
c->idct[1] = ff_hevc_transform_8x8_neon_8;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
+ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
+ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
+ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
+ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
+ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8;
+ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8;
+ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8;
+ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8;
+#if RPI_HEVC_SAND
+ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8;
+ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8;
+ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8;
+ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8;
+ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8;
+ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8;
+ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8;
+ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8;
+ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8;
+ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8;
+ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8;
+ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8;
+#endif
c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8;
+ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8;
+ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8;
+ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8;
+ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8;
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8;
+ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8;
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8;
+#if SAO_FILTER_N == 6
+ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8;
+ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8;
+#endif
+#if RPI_HEVC_SAND
+ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8;
+ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8;
+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8;
+
+ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8;
+ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8;
+ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8;
+
+#if SAO_FILTER_N == 6
+ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8;
+ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8;
+#endif
+#endif
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_epel[x][1][0] = ff_hevc_put_epel_v_neon_8;
+ c->put_hevc_epel[x][0][1] = ff_hevc_put_epel_h_neon_8;
+ c->put_hevc_epel[x][1][1] = ff_hevc_put_epel_hv_neon_8;
}
+ c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
}
+ else if (bit_depth == 10) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10;
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10;
+#ifdef RPI
+ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10;
+ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10;
+ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10;
+#endif
+ c->idct[0] = ff_hevc_transform_4x4_neon_10;
+ c->idct[1] = ff_hevc_transform_8x8_neon_10;
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10;
+ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10;
+ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10;
+ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10;
+ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10;
+ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10;
+ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10;
+ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10;
+ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10;
+#if RPI_HEVC_SAND
+ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10;
+ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10;
+ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10;
+ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10;
+ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10;
+ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10;
+ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10;
+ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10;
+ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10;
+ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10;
+ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10;
+ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10;
+#endif
+ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10;
+ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10;
+ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10;
+ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10;
+ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10;
+ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10;
+
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10;
+ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10;
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10;
+#if SAO_FILTER_N == 6
+ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10;
+ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10;
+#endif
+#if RPI_HEVC_SAND
+ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10;
+ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10;
+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10;
+
+ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10;
+ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10;
+ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10;
+
+#if SAO_FILTER_N == 6
+ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10;
+ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10;
+#endif
+#endif
+ }
+
+ assert(offsetof(MvField, mv) == 0);
+ assert(offsetof(MvField, ref_idx) == 8);
+ assert(offsetof(MvField, pred_flag) == 10);
+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
}
diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S
new file mode 100644
index 0000000000..7cc5cd5e5c
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_res16_neon.S
@@ -0,0 +1,610 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define BIT_DEPTH 10
+
+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmax.s16 \Q2, \Q_MIN
+ vmax.s16 \Q3, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+ vmin.s16 \Q2, \Q_MAX
+ vmin.s16 \Q3, \Q_MAX
+.endm
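+
+@ All the add_residual functions below follow the same basic pattern; a rough
+@ scalar model (for reference only, not part of the build) is:
+@   dst[x] = clip(dst[x] + res[x], 0, (1 << BIT_DEPTH) - 1)
+@ The "_dc" variants add a constant dc value instead of a per-sample residual,
+@ the "_u"/"_v" variants add the residual to one plane of interleaved U/V data
+@ and a dc value to the other, and the "_c" variants add residuals to both.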
+
+@ add_residual4x4(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1
+ vld1.16 {q10, q11}, [r1]
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vld1.16 {d0}, [r0, :64], r2
+ vld1.16 {d1}, [r0, :64], r2
+ vld1.16 {d2}, [r0, :64], r2
+ vld1.16 {d3}, [r0, :64], r2
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ sub r0, r0, r2, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {d0}, [r0, :64], r2
+ vst1.16 {d1}, [r0, :64], r2
+ vst1.16 {d2}, [r0, :64], r2
+ vst1.16 {d3}, [r0, :64], r2
+ bx lr
+
+endfunc
+
+@ add_residual4x4_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vdup.i16 q9, r3
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vdup.16 q15, r2
+ vld1.16 {d2}, [r0, :64], r1
+ vld1.16 {d3}, [r0, :64], r1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ bx lr
+
+endfunc
+
+
+@ add_residual8x8(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ mov r12, #2
+1:
+ vldm r1!, {q10-q13}
+ vld1.16 {q0}, [r0, :128], r2
+ subs r12, #1
+ vld1.16 {q1}, [r0, :128], r2
+ vqadd.s16 q0, q10
+ vld1.16 {q2}, [r0, :128], r2
+ vqadd.s16 q1, q11
+ vld1.16 {q3}, [r0, :128], r2
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ sub r0, r0, r2, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {q0}, [r0, :128], r2
+ vmin.s16 q2, q2, q9
+ vst1.16 {q1}, [r0, :128], r2
+ vmin.s16 q3, q3, q9
+ vst1.16 {q2}, [r0, :128], r2
+ vst1.16 {q3}, [r0, :128], r2
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual4x4_dc_c(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
+ mov r12, #1
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual8x8_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
+ mov r12, #2
+ vdup.16 q15, r2
+9:
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+1:
+ vld1.16 {q0}, [r0, :128], r1
+ subs r12, #1
+ vld1.16 {q1}, [r0, :128], r1
+ vqadd.s16 q0, q15
+ vld1.16 {q2}, [r0, :128], r1
+ vqadd.s16 q1, q15
+ vld1.16 {q3}, [r0, :128], r1
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q8
+ vmax.s16 q1, q8
+ vmax.s16 q2, q8
+ vmax.s16 q3, q8
+ vmin.s16 q0, q9
+ vmin.s16 q1, q9
+ vst1.16 {q0}, [r0, :128], r1
+ vmin.s16 q2, q9
+ vst1.16 {q1}, [r0, :128], r1
+ vmin.s16 q3, q9
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual16x16(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ mov r12, #8
+1:
+ vldm r1!, {q10-q13}
+ @ For RPI Sand we could guarantee :256 but not for general
+ @ non-RPI allocation. :128 is as good as we can claim
+ vld1.16 {q0, q1}, [r0, :128], r2
+ subs r12, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ sub r0, r2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+ vst1.16 {q0, q1}, [r0, :128], r2
+ vst1.16 {q2, q3}, [r0, :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual8x8_dc_c(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
+ mov r12, #4
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual16x16_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
+ vdup.i16 q15, r2
+ mov r12, #8
+9:
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+1:
+ @ For RPI Sand we could guarantee :256 but not for general
+ @ non-RPI allocation. :128 is as good as we can claim
+ vld1.16 {q0, q1}, [r0, :128], r1
+ subs r12, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ sub r0, r1
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+
+@ add_residual32x32(
+@ uint8_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+ mov r12, #32
+1:
+ vldm r1!, {q10-q13}
+ vldm r0, {q0-q3}
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vstm r0, {q0-q3}
+ add r0, r2
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual16x16_dc_c(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
+ mov r12, #16
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual32x32_dc(
+@ uint8_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
+ vdup.i16 q15, r2
+ mov r12, #32
+9:
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+1:
+ vldm r0, {q0-q3}
+ subs r12, #1
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vstm r0, {q0-q3}
+ add r0, r1
+ bne 1b
+ bx lr
+
+endfunc
+
+@ ============================================================================
+@ U add
+
+@ add_residual4x4_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
+ vld1.16 {q10, q11}, [r1, :256]
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+
+ vld2.16 {d0, d2}, [r0, :128], r2
+ vld2.16 {d1, d3}, [r0, :128], r2
+ vld2.16 {d4, d6}, [r0, :128], r2
+ vld2.16 {d5, d7}, [r0, :128], r2
+
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ sub r0, r0, r2, lsl #2
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0, :128], r2
+ vst2.16 {d1, d3}, [r0, :128], r2
+ vst2.16 {d4, d6}, [r0, :128], r2
+ vst2.16 {d5, d7}, [r0, :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #4
+ vdup.i16 q9, r3
+1:
+ vld2.16 {q0, q1}, [r0, :256], r2
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ sub r0, r2
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256], r2
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #16
+ vdup.i16 q9, r3
+ sub r2, #32
+1:
+ vld2.16 {q0, q1}, [r0, :256]!
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ sub r0, #32
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256]!
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ V add
+
+@ add_residual4x4_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
+ vld1.16 {q10, q11}, [r1, :256]
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+
+ vld2.16 {d0, d2}, [r0, :128], r2
+ vld2.16 {d1, d3}, [r0, :128], r2
+ vld2.16 {d4, d6}, [r0, :128], r2
+ vld2.16 {d5, d7}, [r0, :128], r2
+
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ sub r0, r0, r2, lsl #2
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0, :128], r2
+ vst2.16 {d1, d3}, [r0, :128], r2
+ vst2.16 {d4, d6}, [r0, :128], r2
+ vst2.16 {d5, d7}, [r0, :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #4
+ vdup.i16 q9, r3
+1:
+ vld2.16 {q0, q1}, [r0, :256], r2
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ sub r0, r2
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256], r2
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #16
+ vdup.i16 q9, r3
+ sub r2, #32
+1:
+ vld2.16 {q0, q1}, [r0, :256]!
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ sub r0, #32
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0, :256]!
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ ============================================================================
+@ U & V add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
+ vldm r1, {q10-q13}
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ vdup.i16 q9, r3
+
+ vld2.16 {d0, d2}, [r0, :128], r2
+ vld2.16 {d1, d3}, [r0, :128], r2
+ vld2.16 {d4, d6}, [r0, :128], r2
+ vld2.16 {d5, d7}, [r0, :128], r2
+
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ sub r0, r0, r2, lsl #2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+
+ vst2.16 {d0, d2}, [r0, :128], r2
+ vst2.16 {d1, d3}, [r0, :128], r2
+ vst2.16 {d4, d6}, [r0, :128], r2
+ vst2.16 {d5, d7}, [r0, :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #4
+ vdup.i16 q9, r3
+ add r3, r1, #(8*8*2) @ Offset to V
+1:
+ vld2.16 {q0, q1}, [r0, :256], r2
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ vld1.16 {q12, q13}, [r3, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ sub r0, r2
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+ vst2.16 {q0, q1}, [r0, :256], r2
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
+ movw r3, #(1 << BIT_DEPTH) - 1
+ vmov.i64 q8, #0
+ mov r12, #16
+ vdup.i16 q9, r3
+ add r3, r1, #(16*16*2) @ Offset to V
+ sub r2, #32
+1:
+ vld2.16 {q0, q1}, [r0, :256]!
+ vld2.16 {q2, q3}, [r0, :256]
+ vld1.16 {q10, q11}, [r1, :256]!
+ vld1.16 {q12, q13}, [r3, :256]!
+ subs r12, #1
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ sub r0, #32
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmax.s16 q2, q2, q8
+ vmax.s16 q3, q3, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vmin.s16 q2, q2, q9
+ vmin.s16 q3, q3, q9
+ vst2.16 {q0, q1}, [r0, :256]!
+ vst2.16 {q2, q3}, [r0, :256], r2
+ bne 1b
+ bx lr
+endfunc
+
diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
new file mode 100644
index 0000000000..30113d9c93
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_sao_neon.S
@@ -0,0 +1,1882 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.set EDGE_SRC_STRIDE, 160
+
+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128
+ vshr.u8 q12, q8, #3
+ vadd.s8 q8, \Q_K128
+ vshr.u8 q13, q9, #3
+ vadd.s8 q9, \Q_K128
+
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT0, d25
+ vtbl.8 d26, \XLAT1, d26
+ vtbl.8 d27, \XLAT1, d27
+
+ vqadd.s8 q8, q12
+ vshr.u8 q12, q10, #3
+ vadd.s8 q10, \Q_K128
+ vqadd.s8 q9, q13
+ vshr.u8 q13, q11, #3
+ vadd.s8 q11, \Q_K128
+
+ vsub.s8 q8, \Q_K128
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT0, d25
+ vsub.s8 q9, \Q_K128
+ vtbl.8 d26, \XLAT1, d26
+ vtbl.8 d27, \XLAT1, d27
+ vqadd.s8 q10, q12
+ vqadd.s8 q11, q13
+ vsub.s8 q10, \Q_K128
+ vsub.s8 q11, \Q_K128
+.endm
+
+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128
+ vshr.u8 q12, q8, #3
+ vadd.s8 q8, \Q_K128
+
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+
+ vqadd.s8 q8, q12
+ vsub.s8 q8, \Q_K128
+.endm
+
+
+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmax.s16 \Q2, \Q_MIN
+ vmax.s16 \Q3, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+ vmin.s16 \Q2, \Q_MAX
+ vmin.s16 \Q3, \Q_MAX
+.endm
+
+@ Clobbers q12, q13
+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth
+ vshrn.i16 d24, \Q0, #(\bit_depth - 5)
+ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
+ vshrn.i16 d26, \Q2, #(\bit_depth - 5)
+ vshrn.i16 d27, \Q3, #(\bit_depth - 5)
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+ vtbl.8 d26, \XLAT0, d26
+ vtbl.8 d27, \XLAT1, d27
+ vaddw.s8 \Q0, d24
+ vaddw.s8 \Q1, d25
+ vaddw.s8 \Q2, d26
+ vaddw.s8 \Q3, d27
+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
+.endm
+
+@ Clobbers q12
+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth
+ vshrn.i16 d24, \Q0, #(\bit_depth - 5)
+ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+ vaddw.s8 \Q0, d24
+ vaddw.s8 \Q1, d25
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+.endm
+
+
+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
+@ so we are quite safe stuffing it into a byte array
+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
+@ precision
+
+@ This, somewhat nasty, bit of code builds the {d0-d3} translation
+@ array via the stack
+@ Given that sao_left_class > 28 can cause wrap we can't just poke
+@ all 4 bytes in at once
+@
+@ It also loads other common regs
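+@
+@ As a rough scalar model (for reference only), the band filter built from
+@ this table is approximately:
+@   band   = src[x] >> (bit_depth - 5);            @ 32 bands
+@   dst[x] = clip(src[x] + offset_table[band]);
+@ where offset_table[] is all zeros except the 4 entries starting at
+@ sao_left_class (wrapping mod 32), which hold sao_offset_val[1..4]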
+
+function band_load_y
+ vmov.i64 q0, #0
+ ldr r12, [sp, #8] @ &sao_offset_val[0]
+ add r12, #2 @ 1st interesting val is [1]
+ vld1.16 {d16}, [r12] @ Unaligned
+ vmov.i64 q1, #0
+ ldr r12, [sp, #12] @ sao_left_class
+
+ mov r4, sp
+ sub sp, #32
+ and sp, #~63 @ Align stack so we can wrap with a simple AND
+ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack
+ add r12, sp
+ vst1.8 {d16[0]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[2]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[4]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[6]}, [r12]
+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array
+ mov sp, r4
+
+ ldr r12, [sp, #20] @ height
+ pld [r1]
+
+ sub r12, #1
+ add r4, r1, r3
+ bx lr
+endfunc
+
+
+function band_load_c
+ vmov.i64 q2, #0
+ ldr r12, [sp, #8] @ &sao_offset_val1[0]
+ add r12, #2 @ 1st interesting val is [1]
+ vld1.16 {d16}, [r12] @ Unaligned
+ vmov.i64 q3, #0
+ ldr r12, [sp, #12] @ sao_left_class
+
+ mov r4, sp @ Remember SP
+ sub sp, #32
+ and sp, #~63 @ Align stack so we can wrap with a simple AND
+
+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack
+ add r12, sp
+ vst1.8 {d16[0]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[2]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[4]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[6]}, [r12]
+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array
+
+ @ And again for the 2nd set
+ ldr r12, [r4, #16] @ &sao_offset_val2[0]
+ add r12, #2 @ 1st interesting val is [1]
+ vld1.16 {d16}, [r12] @ Unaligned
+ ldr r12, [r4, #20] @ sao_left_class2
+
+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again)
+ add r12, sp
+ vst1.8 {d16[0]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[2]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[4]}, [r12]!
+ and r12, #~32
+ vst1.8 {d16[6]}, [r12]
+ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array
+
+ mov sp, r4
+
+ ldr r12, [sp, #28] @ height
+ pld [r1]
+
+ subs r12, #1
+ add r4, r1, r3
+ bx lr
+endfunc
+
+
+@ ff_hevc_sao_band_64_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_64_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ vmov.u8 q15, #128
+
+1: subs r12, #1
+ vldm r1, {q8-q11}
+ pld [r4]
+ add r1, r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ it ne
+ addne r4, r3
+ vstm r0, {q8-q11}
+ add r0, r2
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_32_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_32_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ vmov.u8 q15, #128
+
+1: subs r12, #2
+ vld1.8 { q8, q9 }, [r1, :128], r3
+ vld1.8 {q10, q11}, [r1, :128], r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.8 { q8, q9 }, [r0, :128], r2
+ vst1.8 {q10, q11}, [r0, :128], r2
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_16_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_16_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ vmov.u8 q15, #128
+
+1: subs r12, #4
+ vld1.8 { q8}, [r1, :128], r3
+ vld1.8 { q9}, [r1, :128], r3
+ vld1.8 {q10}, [r1, :128], r3
+ vld1.8 {q11}, [r1, :128], r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.8 { q8}, [r0, :128], r2
+ vst1.8 { q9}, [r0, :128], r2
+ vst1.8 {q10}, [r0, :128], r2
+ vst1.8 {q11}, [r0, :128], r2
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_8_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_band_8_neon_8, export=1
+ push {r4, lr}
+ bl band_load_y
+ ldr lr, [sp, #16] @ width
+ vmov.u8 q15, #128
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #2
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #4
+ vld1.32 {d16[0]}, [r1, :32], r3
+ vld1.32 {d16[1]}, [r1, :32], r3
+ vld1.32 {d17[0]}, [r1, :32], r3
+ vld1.32 {d17[1]}, [r1, :32], r3
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15
+
+ vst1.32 {d16[0]}, [r0, :32], r2
+ vst1.32 {d16[1]}, [r0, :32], r2
+ vst1.32 {d17[0]}, [r0, :32], r2
+ vst1.32 {d17[1]}, [r0, :32], r2
+ bpl 1b
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_c_32_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_sao_band_c_32_neon_8, export=1
+ push {r4, lr}
+ bl band_load_c
+
+ vmov.i8 q15, #128
+ sub r3, #32
+ sub r2, #32
+
+1: subs r12, #1
+ vld2.8 { q8, q9 }, [r1, :128]!
+ vld2.8 {q10, q11}, [r1, :128], r3
+
+ pld [r4]
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vst2.8 { q8, q9 }, [r0, :128]!
+ vst2.8 {q10, q11}, [r0, :128], r2
+
+ itt ne
+ addne r4, r3
+ addne r4, #32
+
+ bpl 1b
+
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_c_16_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_sao_band_c_16_neon_8, export=1
+ push {r4, lr}
+ bl band_load_c
+ vmov.i8 q15, #128
+
+1: subs r12, #2
+ vld2.8 { q8, q9 }, [r1, :128], r3
+ vld2.8 {q10, q11}, [r1, :128], r3
+
+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vst2.8 { q8, q9 }, [r0, :128], r2
+ vst2.8 {q10, q11}, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+endfunc
+
+@ ff_hevc_sao_band_c_8_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_sao_band_c_8_neon_8, export=1
+ push {r4, lr}
+ bl band_load_c
+ ldr lr, [sp, #24] @ width
+ vmov.u8 q15, #128
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #1
+ vld2.8 {d16, d17}, [r1, :128], r3
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vst2.8 {d16, d17}, [r0, :128], r2
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #1
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+ vuzp.8 d16, d17
+
+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15
+
+ vzip.8 d16, d17
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ bpl 1b
+ pop {r4, pc}
+endfunc
+
+
+@ ff_hevc_sao_band_64_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_64_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q2, #0
+ vdup.i16 q3, lr
+ bl band_load_y
+ vpush {q4-q7}
+
+1: subs r12, #1
+ vldm r1, {q4-q11}
+ add r1, r3
+ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth
+ vstm r0, {q4-q11}
+ add r0, r2
+ bpl 1b
+
+ vpop {q4-q7}
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_64_neon_10, export=1
+ band_64_16 10
+endfunc
+
+@ ff_hevc_sao_band_32_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_32_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q2, #0
+ vdup.i16 q3, lr
+ bl band_load_y
+
+1: subs r12, #1
+ vldm r1, {q8-q11}
+ add r1, r3
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth
+ vstm r0, {q8-q11}
+ add r0, r2
+ bpl 1b
+
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_32_neon_10, export=1
+ band_32_16 10
+endfunc
+
+@ ff_hevc_sao_band_16_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_16_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ bl band_load_y
+
+1: subs r12, #2
+ vld1.16 { q8, q9 }, [r1, :128], r3
+ vld1.16 {q10, q11}, [r1, :128], r3
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth
+ vst1.16 { q8, q9 }, [r0, :128], r2
+ vst1.16 {q10, q11}, [r0, :128], r2
+ bpl 1b
+
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_16_neon_10, export=1
+ band_16_16 10
+endfunc
+
+@ ff_hevc_sao_band_8_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_8_16 bit_depth
+ push {r4, lr}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ bl band_load_y
+ ldr lr, [sp, #16]
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #2
+ vld1.16 { q8}, [r1, :128], r3
+ vld1.16 { q9}, [r1, :128], r3
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth
+ vst1.16 { q8}, [r0, :128], r2
+ vst1.16 { q9}, [r0, :128], r2
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #4
+ vld1.16 {d16}, [r1, :64], r3
+ vld1.16 {d17}, [r1, :64], r3
+ vld1.16 {d18}, [r1, :64], r3
+ vld1.16 {d19}, [r1, :64], r3
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth
+ vst1.16 {d16}, [r0, :64], r2
+ vst1.16 {d17}, [r0, :64], r2
+ vst1.16 {d18}, [r0, :64], r2
+ vst1.16 {d19}, [r0, :64], r2
+ bpl 1b
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_8_neon_10, export=1
+ band_8_16 10
+endfunc
+
+
+@ ff_hevc_sao_band_c_32_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_32_16 bit_depth
+ push {r4, lr}
+ bl band_load_c
+ vpush {q4-q7}
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ sub r2, #96
+
+1: subs r12, #1
+
+ vld2.16 { q4, q5 }, [r1, :128]!
+ vld2.16 { q6, q7 }, [r1, :128]!
+ vld2.16 { q8, q9 }, [r1, :128]!
+ vld2.16 {q10, q11}, [r1, :128], r3
+
+ pld [r4]
+ sub r1, #96
+
+ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ it ne
+ addne r4, r3
+
+ vst2.16 { q4, q5 }, [r0, :128]!
+ vst2.16 { q6, q7 }, [r0, :128]!
+ vst2.16 { q8, q9 }, [r0, :128]!
+ vst2.16 {q10, q11}, [r0, :128], r2
+
+ bpl 1b
+
+ vpop {q4-q7}
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_c_32_neon_10, export=1
+ band_c_32_16 10
+endfunc
+
+
+@ ff_hevc_sao_band_c_16_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_16_16 bit_depth
+ push {r4, lr}
+ bl band_load_c
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ sub r2, #32
+ sub r3, #32
+
+1: subs r12, #1
+
+ vld2.16 { q8, q9 }, [r1, :128]!
+ vld2.16 {q10, q11}, [r1, :128], r3
+
+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ vst2.16 { q8, q9 }, [r0, :128]!
+ vst2.16 {q10, q11}, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_c_16_neon_10, export=1
+ band_c_16_16 10
+endfunc
+
+
+@ ff_hevc_sao_band_c_8_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_8_16 bit_depth
+ push {r4, lr}
+ bl band_load_c
+ movw lr, #(1 << \bit_depth) - 1
+ vmov.i64 q14, #0
+ vdup.i16 q15, lr
+ ldr lr, [sp, #24] @ width
+ cmp lr, #8
+ blt 4f
+
+1: subs r12, #1
+ vld2.16 { q8, q9 }, [r1, :128], r3
+
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ vst2.16 { q8, q9 }, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+
+4:
+1: subs r12, #2
+ vld2.16 {d16, d17}, [r1, :128], r3
+ vld2.16 {d18, d19}, [r1, :128], r3
+
+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth
+
+ vst2.16 {d16, d17}, [r0, :128], r2
+ vst2.16 {d18, d19}, [r0, :128], r2
+
+ bpl 1b
+ pop {r4, pc}
+.endm
+
+function ff_hevc_sao_band_c_8_neon_10, export=1
+ band_c_8_16 10
+endfunc
+
+
+@ =============================================================================
+@ SAO EDGE
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27. For Y d26=d27
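+@
+@ Rough scalar model of the computation (for reference only):
+@   idx    = 2 + sign(c - a) + sign(c - b);        @ 0..4
+@   dst[x] = clip(c + xlat[idx]);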
+
+function edge_64b_body_8
+
+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u8 q13, q5, q1
+ vcgt.u8 q14, q6, q2
+ vcgt.u8 q15, q7, q3
+
+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
+ vcgt.u8 q1, q5
+ vcgt.u8 q2, q6
+ vcgt.u8 q3, q7
+
+ vsub.s8 q0, q12 @ a = sign(c-a)
+ vsub.s8 q1, q13
+ vsub.s8 q2, q14
+ vsub.s8 q3, q15
+
+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
+ vcgt.u8 q13, q5, q9
+ vcgt.u8 q14, q6, q10
+ vcgt.u8 q15, q7, q11
+
+ vsub.s8 q0, q12
+ vsub.s8 q1, q13
+ vsub.s8 q2, q14
+ vsub.s8 q3, q15
+
+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
+ vcgt.u8 q13, q9, q5
+ vcgt.u8 q14, q10, q6
+ vcgt.u8 q15, q11, q7
+
+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
+ vadd.s8 q1, q13
+ vmov.u8 q12, #2
+ vadd.s8 q2, q14
+ vadd.s8 q3, q15
+
+ vadd.s8 q0, q12
+ vadd.s8 q1, q12
+
+ vld1.8 {d26, d27}, [r5]
+
+ vadd.s8 q2, q12
+ vuzp.8 q0, q1
+ vmov.u8 q15, #128
+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
+
+ vtbl.8 d0, {d26}, d0
+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d1, {d26}, d1
+ vadd.s8 q14, q5, q15
+
+ vtbl.8 d2, {d27}, d2
+ vuzp.8 q2, q3
+
+ vtbl.8 d3, {d27}, d3
+
+ vtbl.8 d4, {d26}, d4
+ vzip.8 q0, q1
+
+ vtbl.8 d5, {d26}, d5
+ vqadd.s8 q0, q12
+ vqadd.s8 q1, q14
+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d6, {d27}, d6
+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d7, {d27}, d7
+ vzip.8 q2, q3
+
+ vsub.s8 q0, q15
+ vqadd.s8 q2, q12
+ vqadd.s8 q3, q14
+ vsub.s8 q1, q15
+ vsub.s8 q2, q15
+ vsub.s8 q3, q15
+
+ bx lr
+endfunc
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ r4 upper clip value
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27. For Y d26=d27
+
+function edge_64b_body_16
+
+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0
+ vcgt.u16 q13, q5, q1
+ vcgt.u16 q14, q6, q2
+ vcgt.u16 q15, q7, q3
+
+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0
+ vcgt.u16 q1, q1, q5
+ vcgt.u16 q2, q2, q6
+ vcgt.u16 q3, q3, q7
+
+ vsub.s16 q0, q0, q12 // a = sign(c-a)
+ vsub.s16 q1, q1, q13
+ vsub.s16 q2, q2, q14
+ vsub.s16 q3, q3, q15
+
+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0
+ vcgt.u16 q13, q5, q9
+ vcgt.u16 q14, q6, q10
+ vcgt.u16 q15, q7, q11
+
+ vsub.s16 q0, q0, q12
+ vsub.s16 q1, q1, q13
+ vsub.s16 q2, q2, q14
+ vsub.s16 q3, q3, q15
+
+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0
+ vcgt.u16 q13, q9, q5
+ vcgt.u16 q14, q10, q6
+ vcgt.u16 q15, q11, q7
+
+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b)
+ vadd.s16 q1, q1, q13
+ vmov.u8 q12, #2
+ vadd.s16 q2, q2, q14
+ vadd.s16 q3, q3, q15
+
+ vmovn.s16 d0, q0
+ vmovn.s16 d1, q1
+ vmovn.s16 d2, q2
+ vmovn.s16 d3, q3
+
+ vuzp.8 q0, q1
+
+ vld1.8 {d26, d27}, [r5]
+
+ vadd.s8 q0, q0, q12
+ vadd.s8 q1, q1, q12
+
+ vtbl.8 d0, {d26}, d0
+ vtbl.8 d1, {d26}, d1
+ vtbl.8 d2, {d27}, d2
+ vtbl.8 d3, {d27}, d3
+
+ vmov.i64 q12, #0
+
+ vzip.8 q0, q1
+
+ vdup.i16 q13, r4
+
+ @ Avoid overwrite whilst widening
+ vaddw.s8 q2, q6, d2
+ vaddw.s8 q3, q7, d3
+ vaddw.s8 q1, q5, d1
+ vaddw.s8 q0, q4, d0
+
+ @ now clip
+ clip16_4 q2, q3, q1, q0, q12, q13
+
+ bx lr
+endfunc
+
+
+@ a <- c <- b
+@ a in q0
+@ c in q1
+@ b in q2
+@ Temp q3, q9, q10
+@
+@ d16, d17 (q8) xlat U, V
+@ q14.u8 #2
+@ q15.u8 #128
+
+function edge_16b_body_8
+ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0
+ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0
+ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0
+
+ vsub.s8 q0, q3
+ vsub.s8 q10, q9
+ vadd.s8 q0, q10 @ a = sign(c-a) + sign(c-b)
+
+ vadd.s8 q0, q14
+ vuzp.8 d0, d1
+ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d0, {d16}, d0
+ vtbl.8 d1, {d17}, d1
+
+ vzip.8 d0, d1
+ vqadd.s8 q0, q3
+ vsub.s8 q0, q15
+
+ bx lr
+endfunc
+
+@ a <- c <- b
+@ a in q0
+@ c in q1
+@ b in q2
+@ Temp q3
+@
+@ q12, #0
+@ d16, d17 xlat U, V
+@ q14.u8 #2
+@ q15.u16 max
+function edge_16b_body_16
+ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0
+ vsub.s16 q0, q3 @ a = sign(c-a)
+ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0
+ vsub.s16 q0, q3
+ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0
+ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b)
+
+ vmovn.s16 d0, q0
+ @ d1 will have random contents that we transform but
+ @ that doesn't matter as we then discard them
+ vuzp.8 d0, d1
+
+ vadd.s8 q0, q0, q14
+
+ vtbl.8 d0, {d16}, d0
+ vtbl.8 d1, {d17}, d1
+
+ vzip.8 d0, d1
+
+ vaddw.s8 q0, q1, d0
+
+ @ now clip
+ vmax.s16 q0, q12
+ vmin.s16 q0, q15
+ bx lr
+endfunc
+
+
+@ ff_hevc_sao_edge_[c_]xx_neon(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only
+@ int eo, [sp, #sp_base + 0]
+@ int width, [sp, #sp_base + 4]
+@ int height) [sp, #sp_base + 8]
+
+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0
+ push {r4-r6, lr} @ 16 bytes
+.set sp_base, 16
+
+@ Build translate registers
+@ As translate values can only be 0-4 we don't care about junk in the rest
+@ of the register
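+@ The interleaved load order below builds xlat[idx] = sao_offset_val[edge_idx[idx]]
+@ with edge_idx = {1, 2, 0, 3, 4} (the usual HEVC edge-index reordering); only the
+@ low byte of each int16_t offset is loaded, which is fine while the offsets fit
+@ in a signed byte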
+ mov r12, #2
+.if \is_chroma
+ ldr r4, [sp, #16]
+.set sp_base, sp_base + 4
+.endif
+ vld1.8 {d16[2]}, [r3], r12
+ vld1.8 {d16[0]}, [r3], r12
+ vld1.8 {d16[1]}, [r3], r12
+ vld1.8 {d16[3]}, [r3], r12
+ vld1.8 {d16[4]}, [r3]
+.if \is_chroma
+ vld1.8 {d17[2]}, [r4], r12
+ vld1.8 {d17[0]}, [r4], r12
+ vld1.8 {d17[1]}, [r4], r12
+ vld1.8 {d17[3]}, [r4], r12
+ vld1.8 {d17[4]}, [r4]
+.else
+ vmov d17, d16
+.endif
+
+@ Setup constant registers
+.if \bit_depth > 8
+ movw r4, (1 << \bit_depth) - 1
+.endif
+.if \setup_16b
+.if \bit_depth > 8
+ vmov.i64 q12, #0
+ vdup.16 q15, r4
+.else
+ vmov.u8 q15, #128
+.endif
+ vmov.u8 q14, #2
+.endif
+ movw r3, EDGE_SRC_STRIDE
+
+@ If setup_64b we need the xlat table on the stack and q4-q7 saved
+.if \setup_64b
+ sub r5, sp, #16
+ vpush {q4-q8} @ 80 bytes, q8 pushed first
+.set sp_base, sp_base + 80
+.endif
+
+@ Get jump address
+@ We have a special case for width 4 as the calling code doesn't detect it
+@ If we may have w4 then we add a 2nd jump table after the 1st
+.if \check_w4
+ ldr r12, [sp, #sp_base + 4] @ width
+ cmp r12, #8
+.endif
+ ldr r12, [sp, #sp_base + 0] @ eo
+ adr r6, \jump_tab
+.if \check_w4
+ it lt
+ addlt r6, #16
+.endif
+ ldr r6, [r6, r12, lsl #2]
+
+ ldr r12, [sp, #sp_base + 8] @ height
+
+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
+.if \do2
+ push {r0, r1, r6, r12}
+ blx r6
+ pop {r0, r1, r6, r12}
+
+ add r0, #64
+ add r1, #64
+.endif
+
+ blx r6
+
+@ Tidy up & return
+.if \setup_64b
+ vpop {q4-q8} @ spurious but harmless load of q8
+.endif
+ pop {r4-r6, pc}
+.endm
+
+
+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
+.endm
+
+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab
+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1
+.endm
+
+
+.macro edge_64b_e0, body_fn, pb
+ mov r6, lr
+ sub r1, #8
+1: vldm r1, {d7-d16}
+ subs r12, #1
+ add r1, r3
+ // load a
+ vext.8 q0, q3, q4, #(16 - \pb)
+ vext.8 q1, q4, q5, #(16 - \pb)
+ vext.8 q2, q5, q6, #(16 - \pb)
+ vext.8 q3, q6, q7, #(16 - \pb)
+ // load b
+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite
+ vext.8 q8, q4, q5, #\pb
+ vext.8 q9, q5, q6, #\pb
+ vext.8 q10, q6, q7, #\pb
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_32bx2_e0, body_fn, pb
+ mov r6, lr
+
+1: subs r12, #2
+
+ vld1.8 {q4-q5}, [r1]
+ sub r1, #\pb
+ vld1.8 {q0-q1}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {q8-q9}, [r1], r3
+ sub r1, #\pb
+ vld1.8 {q6-q7}, [r1]
+ sub r1, #\pb
+ vld1.8 {q2-q3}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {q10-q11}, [r1], r3
+ sub r1, #\pb
+
+ bl \body_fn
+
+ vst1.8 {q0,q1}, [r0], r2
+ vst1.8 {q2,q3}, [r0], r2
+
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_16b_e0, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+ sub r3, #\pb * 2
+
+1: subs r12, #1
+
+ vld1.64 {q0}, [r1] @ load a
+ add r1, #\pb
+ vld1.64 {q1}, [r1, :128] @ load c
+ add r1, #\pb
+ vld1.64 {q2}, [r1], r3 @ load b
+
+ bl \body_fn
+ vst1.8 {q0}, [r0], r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e0, body_fn, pb
+ mov r6, lr
+
+1: subs r12, #2
+
+ vld1.8 {d2}, [r1, :64]
+ sub r1, #\pb
+ vld1.8 {d0}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {d4}, [r1], r3
+ sub r1, #\pb
+ vld1.8 {d3}, [r1, :64]
+ sub r1, #\pb
+ vld1.8 {d1}, [r1]
+ add r1, #(\pb * 2)
+ vld1.8 {d5}, [r1], r3
+ sub r1, #\pb
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r0, :64], r2
+
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_4bx4_e0, body_fn, pb
+ mov r6, lr
+
+1: subs r12, #4
+
+ vld1.32 {d2[0]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d0[0]}, [r1]
+ add r1, #(\pb * 2)
+ vld1.32 {d4[0]}, [r1], r3 @ R
+ vld1.32 {d4[1]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d2[1]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d0[1]}, [r1], r3 @ L
+ vld1.32 {d1[0]}, [r1]
+ add r1, #\pb
+ vld1.32 {d3[0]}, [r1]
+ add r1, #\pb
+ vld1.32 {d5[0]}, [r1], r3 @ R
+ vld1.32 {d5[1]}, [r1]
+ sub r1, #(\pb * 2)
+ vld1.32 {d1[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d3[1]}, [r1], r3 @ M
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+
+ bgt 1b
+ bx r6
+.endm
+
+
+.macro edge_64b_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128]!
+ vld1.8 {q2-q3}, [r1, :128], r3
+ sub r1, #32
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128]!
+ vld1.8 {q10-q11}, [r1, :128], r3
+ sub r1, #32
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ // copy c to a
+ vmov.64 q0, q4
+ vmov.64 q1, q5
+ vmov.64 q2, q6
+ vmov.64 q3, q7
+ // copy b to c
+ vmov.64 q4, q8
+ vmov.64 q5, q9
+ vmov.64 q6, q10
+ vmov.64 q7, q11
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_32bx2_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128], r3
+ vld1.8 {q4-q5}, [r1, :128], r3
+
+1: subs r12, #2
+ @ Given the data duplication here we could obviously do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vmov q2, q4
+ vmov q3, q5
+ vld1.8 {q8-q9}, [r1, :128], r3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vmov q6, q8
+ vmov q7, q9
+
+ bl \body_fn
+
+ vst1.8 {q0,q1}, [r0], r2
+ vst1.8 {q2,q3}, [r0], r2
+
+ // copy c to a
+ vmov.64 q0, q8
+ vmov.64 q1, q9
+
+ // copy b to c
+ vmov.64 q4, q10
+ vmov.64 q5, q11
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_16b_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {q0}, [r1, :128], r3
+ // load c
+ vld1.8 {q1}, [r1, :128], r3
+1: subs r12, #1
+ // load b
+ vld1.8 {q2}, [r1, :128], r3
+ bl \body_fn
+ vst1.8 {q0}, [r0], r2
+ // copy c to a
+ vmov.64 q0, q1
+ // copy b to c
+ vmov.64 q1, q2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.8 {d0}, [r1, :64], r3
+ vld1.8 {d2}, [r1, :64], r3
+
+1: subs r12, #2
+ @ Given the data duplication here we could obviously do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vmov.64 d1, d2
+ vld1.8 {d4}, [r1, :64], r3
+ vld1.8 {d5}, [r1, :64], r3
+ vmov.64 d3, d4
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r0], r2
+
+ // copy c to a
+ vmov.64 d0, d4
+ // copy b to c
+ vmov.64 d2, d5
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_4bx4_e1, body_fn
+ mov r6, lr
+ sub r1, r3
+ // load a
+ vld1.32 {d0[0]}, [r1], r3
+ vld1.32 {d0[1]}, [r1], r3
+
+1: subs r12, #4
+ @ Given the data duplication here we could probably do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vld1.32 {d4[0]}, [r1], r3
+ vld1.32 {d4[1]}, [r1], r3
+ vld1.32 {d5[0]}, [r1], r3
+ vld1.32 {d5[1]}, [r1], r3
+
+ vmov.32 d1, d4
+ vext.32 d2, d0, d4, #1
+ vext.32 d3, d4, d5, #1
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+
+ vmov.32 d0, d5
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_64b_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #32
+ sub r3, #(32 - \pb)
+
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ // load b
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #(64 + \pb)
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ bgt 1b
+
+ add r3, #(32 - \pb)
+ bx r6
+.endm
+
+.macro edge_32bx2_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+
+1: sub r1, r3
+ vld1.8 {q0-q1}, [r1], r3
+ vld1.8 {q2-q3}, [r1]
+ subs r12, #2
+ // load c
+ add r1, #\pb
+ vld1.8 {q4-q5}, [r1, :128], r3
+ vld1.8 {q6-q7}, [r1, :128]
+ // load b
+ add r1, #\pb
+ vld1.8 {q8-q9}, [r1], r3
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #(\pb * 2)
+
+ bl \body_fn
+
+ vst1.8 {q0-q1}, [r0], r2
+ vst1.8 {q2-q3}, [r0], r2
+ bgt 1b
+
+ bx r6
+.endm
+
+.macro edge_16b_e2, body_fn, pb
+ mov r6, lr
+ add r3, #\pb
+
+1: sub r1, r3
+ // load a
+ vld1.8 {q0}, [r1], r3
+ subs r12, #1
+ // load c
+ vld1.8 {q1}, [r1, :128], r3
+ // load b
+ vld1.8 {q2}, [r1]
+ sub r1, #\pb
+ bl \body_fn
+ vst1.8 {q0}, [r0], r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+
+1: sub r1, r3
+ vld1.8 {d0}, [r1], r3
+ vld1.8 {d1}, [r1]
+ subs r12, #2
+ // load c
+ add r1, #\pb
+ vld1.8 {d2}, [r1, :64], r3
+ vld1.8 {d3}, [r1, :64]
+ // load b
+ add r1, #\pb
+ vld1.8 {d4}, [r1], r3
+ vld1.8 {d5}, [r1]
+ sub r1, #(\pb * 2)
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r0], r2
+ bgt 1b
+
+ bx r6
+.endm
+
+.macro edge_4bx4_e2, body_fn, pb
+ mov r6, lr
+ sub r1, #\pb
+
+1: sub r1, r3
+ @ line 0 {d0[0], -, - } r1 lo
+ vld1.32 {d0[0]}, [r1], r3
+ subs r12, #4
+ @ Line 1 {d0[1], d2[0], - } r1 lo
+ vld1.32 {d0[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d2[0]}, [r1], r3
+ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid
+ vld1.32 {d2[1]}, [r1]
+ sub r1, #\pb
+ vld1.32 {d1[0]}, [r1]
+ add r1, #\pb * 2
+ vld1.32 {d4[0]}, [r1], r3
+ @ Line 3 {d1[1], d3[0], d4[1]} r1 hi
+ vld1.32 {d4[1]}, [r1]
+ sub r1, #\pb * 2
+ vld1.32 {d1[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d3[0]}, [r1], r3
+ @ Line 4 {-, d3[1], d5[0]} r1 mid
+ vld1.32 {d3[1]}, [r1]
+ add r1, #\pb
+ vld1.32 {d5[0]}, [r1], r3
+ @ Line 5 {-, -, d5[1]} r1 hi
+ vld1.32 {d5[1]}, [r1]
+ sub r1, #(\pb * 2)
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+ bgt 1b
+
+ bx r6
+.endm
+
+.macro edge_64b_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_64b_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_32bx2_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_32bx2_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_16b_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_16b_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_8bx2_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_8bx2_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_4bx4_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_4bx4_e2 \body_fn, (-\pb)
+.endm
+
+.macro edge_64b_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+
+0: edge_64b_e0 \body_fn, \pb
+10: edge_64b_e1 \body_fn
+20: edge_64b_e2 \body_fn, \pb
+30: edge_64b_e3 \body_fn, \pb
+.endm
+
+.macro edge_32bx2_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+
+0: edge_32bx2_e0 \body_fn, \pb
+10: edge_32bx2_e1 \body_fn
+20: edge_32bx2_e2 \body_fn, \pb
+30: edge_32bx2_e3 \body_fn, \pb
+.endm
+
+.macro edge_16b_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+
+0: edge_16b_e0 \body_fn, \pb
+10: edge_16b_e1 \body_fn
+20: edge_16b_e2 \body_fn, \pb
+30: edge_16b_e3 \body_fn, \pb
+.endm
+
+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+ .word 5f
+ .word 15f
+ .word 25f
+ .word 35f
+
+0: edge_32bx2_e0 \body_fn_64b, \pb
+10: edge_32bx2_e1 \body_fn_64b
+20: edge_32bx2_e2 \body_fn_64b, \pb
+30: edge_32bx2_e3 \body_fn_64b, \pb
+5: edge_16b_e0 \body_fn_16b, \pb
+15: edge_16b_e1 \body_fn_16b
+25: edge_16b_e2 \body_fn_16b, \pb
+35: edge_16b_e3 \body_fn_16b, \pb
+.endm
+
+.macro edge_16b_8bx2_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+ .word 5f
+ .word 15f
+ .word 25f
+ .word 35f
+
+0: edge_16b_e0 \body_fn, \pb
+10: edge_16b_e1 \body_fn
+20: edge_16b_e2 \body_fn, \pb
+30: edge_16b_e3 \body_fn, \pb
+5: edge_8bx2_e0 \body_fn, \pb
+15: edge_8bx2_e1 \body_fn
+25: edge_8bx2_e2 \body_fn, \pb
+35: edge_8bx2_e3 \body_fn, \pb
+.endm
+
+.macro edge_8bx2_4bx4_bodies, body_fn, pb
+ .word 0f
+ .word 10f
+ .word 20f
+ .word 30f
+ .word 5f
+ .word 15f
+ .word 25f
+ .word 35f
+
+0: edge_8bx2_e0 \body_fn, \pb
+10: edge_8bx2_e1 \body_fn
+20: edge_8bx2_e2 \body_fn, \pb
+30: edge_8bx2_e3 \body_fn, \pb
+5: edge_4bx4_e0 \body_fn, \pb
+15: edge_4bx4_e1 \body_fn
+25: edge_4bx4_e2 \body_fn, \pb
+35: edge_4bx4_e3 \body_fn, \pb
+.endm
+
+@ void ff_hevc_sao_edge_8_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_8_neon_8, export=1
+ edge_16b_init 8, 0, 1, 99f
+99:
+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1
+endfunc
+
+@ void ff_hevc_sao_edge_16_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_16_neon_8, export=1
+ edge_16b_init 8, 0, 0, 99f
+99:
+ edge_16b_bodies edge_16b_body_8, 1
+endfunc
+
+@ void ff_hevc_sao_edge_32_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_32_neon_8, export=1
+ edge_64b_init 8, 0, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_8, 1
+endfunc
+
+@ void ff_hevc_sao_edge_64_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_64_neon_8, export=1
+ edge_64b_init 8, 0, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_8, 1
+endfunc
+
+@ ff_hevc_sao_edge_c_8_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_8_neon_8, export=1
+ edge_16b_init 8, 1, 1, 99f
+99:
+ edge_16b_8bx2_bodies edge_16b_body_8, 2
+endfunc
+
+@ ff_hevc_sao_edge_c_16_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_16_neon_8, export=1
+ edge_64b_init 8, 1, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_8, 2
+endfunc
+
+@ ff_hevc_sao_edge_c_32_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_32_neon_8, export=1
+ edge_64b_init 8, 1, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_8, 2
+endfunc
+
+@ void ff_hevc_sao_edge_8_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_8_neon_10, export=1
+ edge_16b_init 10, 0, 1, 99f
+99:
+ edge_16b_8bx2_bodies edge_16b_body_16, 2
+endfunc
+
+@ void ff_hevc_sao_edge_16_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_16_neon_10, export=1
+ edge_64b_init 10, 0, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_16, 2
+endfunc
+
+@ void ff_hevc_sao_edge_64_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+@ We simply split the 64 case into 2 vertical stripes
+@ and call the fns for w32
+@
+@ Calling code will always have src != dst so we don't have to worry
+@ about edge effects
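+@
+@ (do2=1 in edge_64b_init makes it run the selected body twice, advancing
+@ dst and src by 64 bytes, i.e. 32 samples at 10 bit, between the passes)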
+
+function ff_hevc_sao_edge_64_neon_10, export=1
+ edge_64b_init 10, 0, 1, 99f
+endfunc
+
+@ void ff_hevc_sao_edge_32_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_sao_edge_32_neon_10, export=1
+ edge_64b_init 10, 0, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_16, 2
+endfunc
+
+@ ff_hevc_sao_edge_c_8_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_8_neon_10, export=1
+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
+99:
+ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
+endfunc
+
+@ ff_hevc_sao_edge_c_32_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_32_neon_10, export=1
+ edge_64b_init 10, 1, 1, 99f
+endfunc
+
+
+@ ff_hevc_sao_edge_c_16_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_sao_edge_c_16_neon_10, export=1
+ edge_64b_init 10, 1, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_16, 4
+endfunc
+
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 1be52e7a12..bae5df4bc6 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,8 @@ enum AVCodecID {
AV_CODEC_ID_SHEERVIDEO,
AV_CODEC_ID_YLC,
+ AV_CODEC_ID_H264_MVC,
+
/* various PCM "codecs" */
AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext {
#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_CAVLC_444 44
+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118
+#define FF_PROFILE_H264_STEREO_HIGH 128
+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
#define FF_PROFILE_VC1_SIMPLE 0
#define FF_PROFILE_VC1_MAIN 1
@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext {
#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
#endif
+ /**
+ * Opaque pointer for use by replacement get_buffer2 code
+ *
+ * @author jc (08/02/2016)
+ */
+ void * get_buffer_context;
+
} AVCodecContext;
AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 1bf1c620d6..ccfa991f60 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
typedef struct CABACContext{
int low;
int range;
- int outstanding_count;
+ union
+ {
+ int outstanding_count;
+ struct {
+ uint16_t bits;
+ uint16_t range;
+ } by22;
+ };
const uint8_t *bytestream_start;
const uint8_t *bytestream;
const uint8_t *bytestream_end;
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 9d94b72..535ebf0 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
+ {
+ .id = AV_CODEC_ID_H264_MVC,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "h264_mvc",
+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
/* various PCM "codecs" */
{
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index efe3555..16358aa 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -126,7 +126,9 @@ enum {
NAL_END_STREAM = 11,
NAL_FILLER_DATA = 12,
NAL_SPS_EXT = 13,
+ NAL_SPS_SUBSET = 15,
NAL_AUXILIARY_SLICE = 19,
+ NAL_SLICE_EXT = 20,
NAL_FF_IGNORE = 0xff0f001,
};
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index ce4bab2..b9b0c78 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
-    uint8_t parse_history[6];
+    uint8_t parse_history[9];
int parse_history_count;
int parse_last_mb;
+ int is_mvc;
+ int slice_ext;
} H264ParseContext;
@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
} else if (state <= 5) {
int nalu_type = buf[i] & 0x1F;
if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
- nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+ nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+ nalu_type == NAL_SPS_SUBSET) {
if (pc->frame_start_found) {
i++;
goto found;
}
} else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
- nalu_type == NAL_IDR_SLICE) {
+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
state += 8;
+
+ p->slice_ext = (nalu_type == NAL_SLICE_EXT);
continue;
}
state = 7;
} else {
p->parse_history[p->parse_history_count++] = buf[i];
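+            // NAL_SLICE_EXT (type 20) carries a 3-byte MVC extension header before
+            // the slice header, so collect 3 extra history bytes and skip them
+            // before reading first_mb_in_slice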
- if (p->parse_history_count > 5) {
+ if (p->parse_history_count > 8) {
unsigned int mb, last_mb = p->parse_last_mb;
GetBitContext gb;
- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
p->parse_history_count = 0;
mb= get_ue_golomb_long(&gb);
p->parse_last_mb = mb;
@@ -145,7 +150,7 @@ found:
pc->frame_start_found = 0;
if (p->is_avc)
return next_avc;
- return i - (state & 5) - 5 * (state > 7);
+ return i - (state & 5) - 8 * (state > 7);
}
static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
}
}
- parse_nal_units(s, avctx, buf, buf_size);
+ if (!p->is_mvc)
+ parse_nal_units(s, avctx, buf, buf_size);
if (avctx->framerate.num)
avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
if ((state & 0xFFFFFF00) != 0x100)
break;
nalu_type = state & 0x1F;
- if (nalu_type == NAL_SPS) {
+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
has_sps = 1;
} else if (nalu_type == NAL_PPS)
has_pps = 1;
@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
.parser_close = h264_close,
.split = h264_split,
};
+
+static av_cold int init_mvc(AVCodecParserContext *s)
+{
+ H264ParseContext *p = s->priv_data;
+ int ret = init(s);
+ if (ret < 0)
+ return ret;
+
+ p->is_mvc = 1;
+ return 0;
+}
+
+AVCodecParser ff_h264_mvc_parser = {
+ .codec_ids = { AV_CODEC_ID_H264_MVC },
+ .priv_data_size = sizeof(H264ParseContext),
+ .parser_init = init_mvc,
+ .parser_parse = h264_parse,
+ .parser_close = h264_close,
+ .split = h264_split,
+};
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index c1fa67f67b..6f99021339 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -41,8 +41,346 @@
#include "hevc.h"
#include "profiles.h"
+#ifdef RPI
+ #include "rpi_qpu.h"
+ #include "rpi_shader.h"
+ #include "rpi_shader_cmd.h"
+ #include "rpi_shader_template.h"
+ #include "rpi_zc.h"
+ #include "libavutil/rpi_sand_fns.h"
+
+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+ #define RPI_CACHE_UNIF_MVS 1
+
+ #include "pthread.h"
+ #include "libavutil/atomic.h"
+
+ static void worker_core(HEVCContext * const s);
+#endif
+
+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards
+
+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+
+#ifndef av_mod_uintp2
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+ return a & ((1 << p) - 1);
+}
+# define av_mod_uintp2 av_mod_uintp2_c
+#endif
+
const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+
+#if RPI_INTER
+
+#define MC_DUMMY_X (-32)
+#define MC_DUMMY_Y (-32)
+
+// UV still has min 4x4 pred
+// Allow for even spread +1 for setup, +1 for rounding
+// As we have load sharing this can (in theory) be exceeded so we have to
+// check after each CTU, but it is a good base size
+
+// Worst case (all 4x4) commands per CTU
+#define QPU_Y_CMD_PER_CTU_MAX (8 * 8)
+#define QPU_C_CMD_PER_CTU_MAX (4 * 4)
+
+#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX)
+#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX)
+
+// The QPU code for UV blocks only works up to a block width of 8
+#define RPI_CHROMA_BLOCK_WIDTH 8
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+
+
+// Actual filter goes -ve, +ve, +ve, -ve using these values
+static const uint32_t rpi_filter_coefs[8] = {
+ ENCODE_COEFFS( 0, 64, 0, 0),
+ ENCODE_COEFFS( 2, 58, 10, 2),
+ ENCODE_COEFFS( 4, 54, 16, 2),
+ ENCODE_COEFFS( 6, 46, 28, 4),
+ ENCODE_COEFFS( 4, 36, 36, 4),
+ ENCODE_COEFFS( 4, 28, 46, 6),
+ ENCODE_COEFFS( 2, 16, 54, 4),
+ ENCODE_COEFFS( 2, 10, 58, 2)
+};
+
+// Function arrays by QPU
+
+static const int * const inter_pred_setup_c_qpu[12] = {
+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
+};
+
+static const int * const inter_pred_setup_c10_qpu[12] = {
+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
+};
+
+static const int * const inter_pred_setup_y_qpu[12] = {
+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
+};
+
+static const int * const inter_pred_setup_y10_qpu[12] = {
+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
+};
+
+static const int * const inter_pred_sync_qpu[12] = {
+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
+};
+
+static const int * const inter_pred_sync10_qpu[12] = {
+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
+};
+
+static const int * const inter_pred_exit_c_qpu[12] = {
+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
+};
+
+static const int * const inter_pred_exit_c10_qpu[12] = {
+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
+};
+
+static const int * const inter_pred_exit_y_qpu[12] = {
+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
+};
+
+static const int * const inter_pred_exit_y10_qpu[12] = {
+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
+};
+
+typedef struct ipe_chan_info_s
+{
+ const unsigned int n;
+ const int * const * setup_fns;
+ const int * const * sync_fns;
+ const int * const * exit_fns;
+} ipe_chan_info_t;
+
+typedef struct ipe_init_info_s
+{
+ ipe_chan_info_t luma;
+ ipe_chan_info_t chroma;
+} ipe_init_info_t;
+
+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
+ { // 8
+ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
+ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
+ },
+ { // 9
+ .luma = {0},
+ .chroma = {0}
+ },
+ { // 10
+ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
+ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
+ }
+
+};
+
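+// Split the inter-pred GPU buffer evenly between the channel's n QPU command
+// queues and record each queue's setup/sync/exit program entry points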
+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
+{
+ const unsigned int n = ici->n;
+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
+
+ ipe->n = n;
+ ipe->max_fill = q1_size - ipe->min_gap;
+ for(unsigned int i = 0; i < n; i++) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr = q->qpu_mc_base =
+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
+ q->code_setup = qpu_fn(ici->setup_fns[i]);
+ q->code_sync = qpu_fn(ici->sync_fns[i]);
+ q->code_exit = qpu_fn(ici->exit_fns[i]);
+ }
+}
+
+static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth)
+{
+ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8;
+
+ av_assert0(bit_depth >= 8 && bit_depth <= 16);
+
+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
+
+ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) {
+ HEVCRpiJob *const jb = s->jobs + i;
+ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma);
+ set_ipe_from_ici(&jb->luma_ip, &iii->luma);
+ }
+}
+
+
+#endif
+
+
+#ifdef RPI
+
+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+
+#define LOG_ENTER
+#define LOG_EXIT
+
+#define USE_SEM 1
+
+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+static void worker_submit_job(HEVCContext * const s)
+{
+ LOG_ENTER
+ sem_post(&s->jb0->sem_in);
+ s->jb0->pending = 1;
+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ s->jb0 = s->jobs + s->pass0_job;
+ LOG_EXIT
+}
+
+// Call this to say we have completed pass1
+static void worker_complete_job(HEVCContext * const s)
+{
+ LOG_ENTER
+ sem_t * const sem = &s->jb1->sem_out;
+ // Must set job no before signalling as otherwise rpi_do_all_passes
+ // may call worker_core from the main thread with a bad job number
+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ s->jb1 = s->jobs + s->pass1_job;
+ sem_post(sem);
+ LOG_EXIT
+}
+
+
+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+// available to receive the next job.
+static void worker_pass0_ready(HEVCContext *s)
+{
+ LOG_ENTER
+ HEVCRpiJob * const jb = s->jb0;
+ if (jb->pending) {
+ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR)
+ /* Loop */;
+ jb->pending = 0;
+ }
+ LOG_EXIT
+}
+
+// Call this to wait for all jobs to have completed at the end of a frame
+static void worker_wait(HEVCContext * const s)
+{
+ LOG_ENTER
+ unsigned int i;
+ for (i = 0; i != RPI_MAX_JOBS; ++i) {
+ HEVCRpiJob * const jb = s->jobs + i;
+ if (jb->pending) {
+ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR)
+ /* Loop */;
+ jb->pending = 0;
+ }
+ }
+ LOG_EXIT
+}
+
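+// Worker thread: wait for a job to be submitted, run pass 1 on it (worker_core)
+// and mark it complete; exit when the terminate flag is set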
+static void *worker_start(void *arg)
+{
+ HEVCContext * const s = (HEVCContext *)arg;
+
+ for (;;)
+ {
+ HEVCRpiJob * const jb = s->jb1;
+ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR)
+ /* Loop */;
+ if (jb->terminate)
+ break;
+
+ LOG_ENTER
+ worker_core(s);
+ worker_complete_job(s);
+ LOG_EXIT
+ }
+ return NULL;
+}
+
+static void worker_pic_free_all(HEVCContext * const s)
+{
+ unsigned int i;
+
+ // Free coeff stuff - allocation not the same for all buffers
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ {
+ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs;
+
+ if (cf->s[0].buf != NULL)
+ av_freep(&cf->mptr);
+ if (cf->s[2].buf != NULL)
+ gpu_free(&cf->gptr);
+ memset(cf, 0, sizeof(*cf));
+ }
+}
+
+static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count)
+{
+ unsigned int i;
+
+    // Alloc coeff stuff - allocation is not the same for all buffers
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ {
+ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs;
+
+// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL);
+// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL);
+// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL);
+// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL);
+
+ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
+ goto fail;
+ cf->s[2].buf = (int16_t *)cf->gptr.arm;
+ cf->s[3].buf = cf->s[2].buf + coeff_count;
+
+        // Must be 64-byte aligned for our zeroing code, so over-allocate
+        // and round up
+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
+ goto fail;
+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
+ }
+ return 0;
+
+fail:
+ printf("%s: **** Failed\n", __func__);
+ worker_pic_free_all(s);
+ return -1;
+}
+
+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
+{
+ unsigned int i;
+ for (i = 0; i != 4; ++i) {
+ cf->s[i].n = 0;
+ }
+}
+#endif
+
+
/**
* NOTE: Each function hls_foo correspond to the function foo in the
* specification (HLS stands for High Level Syntax).
@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
/* free everything allocated by pic_arrays_init() */
static void pic_arrays_free(HEVCContext *s)
{
+#ifdef RPI
+ worker_pic_free_all(s);
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+
+ if (dvq->vpu_cmds_arm) {
+ gpu_free(&dvq->deblock_vpu_gmem);
+ dvq->vpu_cmds_arm = 0;
+ }
+ }
+ }
+#endif
av_freep(&s->sao);
av_freep(&s->deblock);
@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
int ctb_count = sps->ctb_width * sps->ctb_height;
int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+#ifdef RPI
+ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
+ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+ const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+
+ av_assert0(sps);
+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+#if RPI_ROUND_TO_LINES
+ // Round down to an integral quantity of lines
+ if (s->max_ctu_count > sps->ctb_width)
+ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width;
+#endif
+
+ if (worker_pic_alloc_all(s, coefs_per_row) != 0)
+ goto fail;
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ s->enable_rpi_deblock = !sps->sao_enabled;
+ s->setup_width = (sps->width+15) / 16;
+ s->setup_height = (sps->height+15) / 16;
+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+ {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+            const unsigned int total_size = cmd_size + y_size + uv_size;
+ int p_vc;
+ uint8_t * p_arm;
+ #if RPI_VPU_DEBLOCK_CACHED
+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+ #else
+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+ #endif
+ p_vc = dvq->deblock_vpu_gmem.vc;
+ p_arm = dvq->deblock_vpu_gmem.arm;
+
+ // Zap all
+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+
+ // Subdivide
+ dvq->vpu_cmds_arm = (void*)p_arm;
+ dvq->vpu_cmds_vc = p_vc;
+
+ p_arm += cmd_size;
+ p_vc += cmd_size;
+
+ dvq->y_setup_arm = (void*)p_arm;
+ dvq->y_setup_vc = (void*)p_vc;
+
+ p_arm += y_size;
+ p_vc += y_size;
+
+ dvq->uv_setup_arm = (void*)p_arm;
+ dvq->uv_setup_vc = (void*)p_vc;
+ }
+
+ s->dvq_n = 0;
+ s->dvq = s->dvq_ents + s->dvq_n;
+ }
+#endif
+
s->bs_width = (width >> 2) + 1;
s->bs_height = (height >> 2) + 1;
@@ -137,6 +560,29 @@ fail:
return AVERROR(ENOMEM);
}
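+// Fill the slice-header weight tables with unit weights and zero offsets for
+// use when the bitstream carries no explicit pred_weight_table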
+static void default_pred_weight_table(HEVCContext * const s)
+{
+ unsigned int i;
+ s->sh.luma_log2_weight_denom = 0;
+ s->sh.chroma_log2_weight_denom = 0;
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+ s->sh.luma_weight_l0[i] = 1;
+ s->sh.luma_offset_l0[i] = 0;
+ s->sh.chroma_weight_l0[i][0] = 1;
+ s->sh.chroma_offset_l0[i][0] = 0;
+ s->sh.chroma_weight_l0[i][1] = 1;
+ s->sh.chroma_offset_l0[i][1] = 0;
+ }
+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+ s->sh.luma_weight_l1[i] = 1;
+ s->sh.luma_offset_l1[i] = 0;
+ s->sh.chroma_weight_l1[i][0] = 1;
+ s->sh.chroma_offset_l1[i][0] = 0;
+ s->sh.chroma_weight_l1[i][1] = 1;
+ s->sh.chroma_offset_l1[i][1] = 0;
+ }
+}
+
static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
{
int i = 0;
@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
{
#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
- int ret, i;
+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts;
+ int ret;
pic_arrays_free(s);
s->ps.sps = NULL;
@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
switch (sps->pix_fmt) {
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
+#if RPI_HEVC_SAND
+ // Currently geometry calc is stuffed for big sizes
+ if (sps->width < 2048 && sps->height <= 1088) {
+ *fmt++ = AV_PIX_FMT_SAND128;
+ }
+#endif
#if CONFIG_HEVC_DXVA2_HWACCEL
*fmt++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
#endif
break;
case AV_PIX_FMT_YUV420P10:
+#if RPI_HEVC_SAND
+ // Currently geometry calc is stuffed for big sizes
+ if (sps->width < 2048 && sps->height <= 1088) {
+ *fmt++ = AV_PIX_FMT_SAND64_10;
+ }
+#endif
#if CONFIG_HEVC_DXVA2_HWACCEL
*fmt++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
ret = ff_thread_get_format(s->avctx, pix_fmts);
if (ret < 0)
goto fail;
+
s->avctx->pix_fmt = ret;
}
else {
@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
ff_hevc_pred_init(&s->hpc, sps->bit_depth);
ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth);
ff_videodsp_init (&s->vdsp, sps->bit_depth);
+#ifdef RPI
+ rpi_hevc_qpu_set_fns(s, sps->bit_depth);
+#endif
- for (i = 0; i < 3; i++) {
- av_freep(&s->sao_pixel_buffer_h[i]);
- av_freep(&s->sao_pixel_buffer_v[i]);
- }
+ av_freep(&s->sao_pixel_buffer_h[0]);
+ av_freep(&s->sao_pixel_buffer_v[0]);
if (sps->sao_enabled && !s->avctx->hwaccel) {
- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
- int c_idx;
+ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+ unsigned int c_idx;
+ size_t vsize[3] = {0};
+ size_t hsize[3] = {0};
for(c_idx = 0; c_idx < c_count; c_idx++) {
int w = sps->width >> sps->hshift[c_idx];
int h = sps->height >> sps->vshift[c_idx];
- s->sao_pixel_buffer_h[c_idx] =
- av_malloc((w * 2 * sps->ctb_height) <<
- sps->pixel_shift);
- s->sao_pixel_buffer_v[c_idx] =
- av_malloc((h * 2 * sps->ctb_width) <<
- sps->pixel_shift);
+            // ctb height & width are a min of 8 so this must be a multiple of 16
+ // so no point rounding up!
+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
}
+
+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
+ // when we have plaited chroma
+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
}
s->ps.sps = sps;
@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s)
(s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
pred_weight_table(s, gb);
}
+ else
+ {
+ // Give us unit weights
+ default_pred_weight_table(s);
+ }
sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
return 0;
}
+#ifdef RPI
+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s)
+{
+ return s->jb0->intra.cmds + s->jb0->intra.n++;
+}
+
+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+{
+ // U & V done on U call in the case of sliced frames
+ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1)
+ return;
+
+ if (s->enable_rpi) {
+ HEVCLocalContext *lc = s->HEVClc;
+ HEVCPredCmd *cmd = rpi_new_intra_cmd(s);
+ cmd->type = RPI_PRED_INTRA;
+ cmd->size = log2_trafo_size;
+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+ cmd->c_idx = c_idx;
+ cmd->i_pred.x = x0;
+ cmd->i_pred.y = y0;
+ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+ }
+ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) {
+ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+ else {
+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+
+}
+#endif
+
static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int xBase, int yBase, int cb_xBase, int cb_yBase,
int log2_cb_size, int log2_trafo_size,
@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
int trafo_size = 1 << log2_trafo_size;
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+#endif
}
if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+#endif
}
} else if (blk_idx == 3) {
int trafo_size_h = 1 << (log2_trafo_size + 1);
int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, xBase, yBase,
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#endif
}
}
}
@@ -1281,47 +1827,119 @@ do {
return 0;
}
-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+
+static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
{
- HEVCLocalContext *lc = s->HEVClc;
GetBitContext gb;
- int cb_size = 1 << log2_cb_size;
- int stride0 = s->frame->linesize[0];
- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
- int stride1 = s->frame->linesize[1];
- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
- int stride2 = s->frame->linesize[2];
- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
-
- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
- s->ps.sps->pcm.bit_depth_chroma;
- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
int ret;
- if (!s->sh.disable_deblocking_filter_flag)
- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
-
ret = init_get_bits(&gb, pcm, length);
if (ret < 0)
return ret;
- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
- if (s->ps.sps->chroma_format_idc) {
- s->hevcdsp.put_pcm(dst1, stride1,
+#if RPI_HEVC_SAND
+ if (av_rpi_is_sand_frame(s->frame)) {
+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
+ s->frame->linesize[0],
+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+
+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]),
+ s->frame->linesize[1],
cb_size >> s->ps.sps->hshift[1],
cb_size >> s->ps.sps->vshift[1],
&gb, s->ps.sps->pcm.bit_depth_chroma);
- s->hevcdsp.put_pcm(dst2, stride2,
- cb_size >> s->ps.sps->hshift[2],
- cb_size >> s->ps.sps->vshift[2],
- &gb, s->ps.sps->pcm.bit_depth_chroma);
}
+ else
+#endif
+ {
+ const int stride0 = s->frame->linesize[0];
+ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
+ const int stride1 = s->frame->linesize[1];
+ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+ const int stride2 = s->frame->linesize[2];
+ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
+
+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+ if (s->ps.sps->chroma_format_idc) {
+ s->hevcdsp.put_pcm(dst1, stride1,
+ cb_size >> s->ps.sps->hshift[1],
+ cb_size >> s->ps.sps->vshift[1],
+ &gb, s->ps.sps->pcm.bit_depth_chroma);
+ s->hevcdsp.put_pcm(dst2, stride2,
+ cb_size >> s->ps.sps->hshift[2],
+ cb_size >> s->ps.sps->vshift[2],
+ &gb, s->ps.sps->pcm.bit_depth_chroma);
+ }
+ }
return 0;
}
+#ifdef RPI
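+// Allocate n int16_t coefficients from the job's coefficient buffer buf_no;
+// buffer 3 is filled downwards from its base, the others upwards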
+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)
+{
+ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no;
+ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
+ cfe->n += n;
+ return coeffs;
+}
+#endif
+
+// x * 2^(y*2)
+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
+{
+ return x << (y * 2);
+}
+
+static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size)
+{
+ // Length in bits
+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) +
+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]);
+
+ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3);
+
+ if (!s->sh.disable_deblocking_filter_flag)
+ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ // Copy coeffs
+ const int blen = (length + 7) >> 3;
+ // Round allocated bytes up to nearest 32 to avoid alignment confusion
+        // Allocation is in int16_t units
+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
+ // sample this rounding doesn't affect the total size we need to allocate for
+ // the coeff buffer
+ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1);
+ memcpy(coeffs, pcm, blen);
+
+        // Our coeff stash assumes that any partially allocated 64-byte lump
+ // is zeroed so make that true.
+ {
+ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
+ if ((-(intptr_t)eopcm & 63) != 0)
+ memset(eopcm, 0, -(intptr_t)eopcm & 63);
+ }
+
+ // Add command
+ {
+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s);
+ cmd->type = RPI_PRED_I_PCM;
+ cmd->size = log2_cb_size;
+ cmd->i_pcm.src = coeffs;
+ cmd->i_pcm.x = x0;
+ cmd->i_pcm.y = y0;
+ cmd->i_pcm.src_len = length;
+ }
+ return 0;
+ }
+#endif
+
+ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size);
+}
+
/**
* 8.5.3.2.2.1 Luma sample unidirectional interpolation process
*
@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
int idx = ff_hevc_pel_weight[block_w];
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> 2;
y_off += mv->y >> 2;
src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
* @param mv1 motion vector1 (relative to block position) to get pixel data from
* @param current_mv current motion vector structure
*/
- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
{
@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
intptr_t _mx = mx << (1 - hshift);
intptr_t _my = my << (1 - vshift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> (2 + hshift);
y_off += mv->y >> (2 + vshift);
src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1572,6 +2202,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
int hshift = s->ps.sps->hshift[1];
int vshift = s->ps.sps->vshift[1];
+#ifdef DISABLE_MC
+ return;
+#endif
+
intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
_mx1, _my1, block_w);
}
-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
- const Mv *mv, int y0, int height)
+#ifdef RPI
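+// Wait until the given field of ref has progressed to at least val.
+// Waiters are queued on the owning decoder's progress state and are woken
+// by ff_hevc_rpi_progress_signal_field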
+void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int val, const int field)
{
- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
+ HEVCContext *const fs = ref->tf.owner->priv_data;
+ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field;
+ sem_t * sem = NULL;
+
+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+ if (((volatile int *)ref->tf.progress->data)[field] < val) {
+ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait;
+
+ av_assert0(pwait->req == -1 && pwait->next == NULL);
- if (s->threads_type == FF_THREAD_FRAME )
- ff_thread_await_progress(&ref->tf, y, 0);
+ pwait->req = val;
+ pwait->next = NULL;
+ if (pstate->first == NULL)
+ pstate->first = pwait;
+ else
+ pstate->last->next = pwait;
+ pstate->last = pwait;
+ sem = &pwait->sem;
+ }
+ pthread_mutex_unlock(&pstate->lock);
+
+ if (sem != NULL) {
+ while (sem_wait(sem) != 0)
+ av_assert0(errno == EINTR);
+ }
+ }
+}
+
+void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field)
+{
+ HEVCRPiFrameProgressState *const pstate = s->progress_states + field;
+
+ ((int *)s->ref->tf.progress->data)[field] = val;
+
+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+ {
+ HEVCRPiFrameProgressWait ** ppwait = &pstate->first;
+ HEVCRPiFrameProgressWait * pwait;
+
+ while ((pwait = *ppwait) != NULL) {
+ if (pwait->req > val)
+ {
+ ppwait = &pwait->next;
+ pstate->last = pwait;
+ }
+ else
+ {
+ *ppwait = pwait->next;
+ pwait->req = -1;
+ pwait->next = NULL;
+ sem_post(&pwait->sem);
+ }
+ }
+ }
+ pthread_mutex_unlock(&pstate->lock);
+}
+
+static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate)
+{
+ pstate->first = NULL;
+ pstate->last = NULL;
+ pthread_mutex_init(&pstate->lock, NULL);
+}
+
+static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait)
+{
+ pwait->req = -1;
+ pwait->next = NULL;
+ sem_init(&pwait->sem, 0, 0);
+}
+
+static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate)
+{
+ av_assert0(pstate->first == NULL);
+ pthread_mutex_destroy(&pstate->lock);
+}
+
+static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait)
+{
+ sem_destroy(&pwait->sem);
+}
+#endif
+
+static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref,
+ const Mv * const mv, const int y0, const int height)
+{
+ if (s->threads_type == FF_THREAD_FRAME) {
+ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ int16_t *const pr = s->jb0->progress + ref->dpb_no;
+ if (*pr < y) {
+ *pr = y;
+ }
+ }
+ else
+#endif
+ // It is a const ThreadFrame but the prototype isn't
+ ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
+ }
}
static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
}
}
-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH,
- int log2_cb_size, int partIdx, int idx)
+
+#if RPI_INTER
+
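+// Pick the least-loaded QPU in the current group, account load_val against it
+// and chain the new command by writing fn into the link word of its previous command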
+static HEVCRpiInterPredQ *
+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
+{
+ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr;
+ HEVCRpiInterPredQ * ypt = yp + 1;
+ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) {
+ if (ypt->load < yp->load)
+ yp = ypt;
+ }
+
+ yp->load += load_val;
+ ipe->used_grp = 1;
+ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd
+
+ return yp;
+}
+
+
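+// Append each QPU's sync program to its command chain and reset the per-QPU
+// load counters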
+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
+{
+ for (unsigned int i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr->data[-1] = q->code_sync;
+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1);
+ q->load = 0;
+ }
+}
+
+// Returns 0 on success, -1 if Q is dangerously full
+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
+{
+ if (!ipe->used_grp)
+ return 0;
+
+ if ((ipe->curr += ipe->n_grp) >= ipe->n)
+ {
+ ipe->curr = 0;
+ rpi_inter_pred_sync(ipe);
+ }
+ ipe->used = 1;
+ ipe->used_grp = 0;
+
+ for (unsigned int i = 0; i != ipe->n_grp; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr;
+ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
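+// Reset all QPU command queues to empty ready for a new job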
+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ ipe->curr = 0;
+ ipe->used = 0;
+ ipe->used_grp = 0;
+ for (i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr = q->qpu_mc_base;
+ q->load = 0;
+ q->last_l0 = NULL;
+ q->last_l1 = NULL;
+ }
+}
+
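+// Allocate the GPU memory (cached if RPI_CACHE_UNIF_MVS is set) that backs the
+// QPU command queues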
+static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
+ const unsigned int n_max, const unsigned int n_grp,
+ const unsigned int total_size, const unsigned int min_gap)
+{
+ memset(ipe, 0, sizeof(*ipe));
+ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL);
+ ipe->n_grp = n_grp;
+ ipe->min_gap = min_gap;
+
+#if RPI_CACHE_UNIF_MVS
+ gpu_malloc_cached(total_size, &ipe->gptr);
+#else
+ gpu_malloc_uncached(total_size, &ipe->gptr);
+#endif
+}
+
+
+#if RPI_QPU_EMU_Y
+#define get_mc_address_y(f) ((f)->data[0])
+#else
+#define get_mc_address_y(f) get_vc_address_y(f)
+#endif
+#if RPI_QPU_EMU_C
+#define get_mc_address_u(f) ((f)->data[1])
+#else
+#define get_mc_address_u(f) get_vc_address_u(f)
+#endif
+
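+// Scale a weighted-pred offset to the stream bit depth unless high-precision
+// offsets are signalled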
+static inline int offset_depth_adj(const HEVCContext *const s, const int wt)
+{
+ return s->ps.sps->high_precision_offsets_enabled_flag ? wt :
+ wt << (s->ps.sps->bit_depth - 8);
+}
+
+static void
+rpi_pred_y(HEVCContext *const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const Mv *const mv,
+ const int weight_mul,
+ const int weight_offset,
+ AVFrame *const src_frame)
+{
+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my << 8) | mx;
+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
+ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul);
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+
+ if (my_mx == 0)
+ {
+ const int x1 = x0 + (mv->x >> 2);
+ const int y1 = y0 + (mv->y >> 2);
+ const int bh = nPbH;
+
+ for (int start_x = 0; start_x < nPbW; start_x += 16)
+ {
+ const int bw = FFMIN(nPbW - start_x, 16);
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
+
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ ++ts->y_pred1_x0y0;
+
+ if (nPbW > 8)
+ ++ts->y_pred1_wgt8;
+ else
+ ++ts->y_pred1_wle8;
+
+ if (nPbH > 16)
+ ++ts->y_pred1_hgt16;
+ else
+ ++ts->y_pred1_hle16;
+ }
+#endif
+
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src_vc_address_y;
+ cmd_y->w = bw;
+ cmd_y->h = bh;
+ cmd_y->wo1 = wo;
+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+ else
+ {
+ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+ const unsigned int bh = nPbH;
+ int start_x = 0;
+
+#if 1
+        // As Y-pred operates on two independent 8-wide src blocks we can merge
+        // this pred with the previous one if the previous one is 8 pel wide,
+ // the same height as the current block, immediately to the left of our
+ // current dest block and mono-pred.
+
+ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p;
+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
+ {
+ const int bw = FFMIN(nPbW, 8);
+ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1;
+
+ last_y8_src2->x = x1_m3;
+ last_y8_src2->y = y1_m3;
+ last_y8_src2->base = src_vc_address_y;
+ last_y8_p->w += bw;
+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
+ last_y8_p->wo2 = wo;
+
+ s->last_y8_p = NULL;
+ s->last_y8_l1 = NULL;
+ start_x = bw;
+#if RPI_TSTATS
+ ++s->tstats.y_pred1_y8_merge;
+#endif
+ }
+#endif
+
+ for (; start_x < nPbW; start_x += 16)
+ {
+ const int bw = FFMIN(nPbW - start_x, 16);
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ if (mx == 0 && my == 0)
+ ++ts->y_pred1_x0y0;
+ else if (mx == 0)
+ ++ts->y_pred1_x0;
+ else if (my == 0)
+ ++ts->y_pred1_y0;
+ else
+ ++ts->y_pred1_xy;
+
+ if (nPbW > 8)
+ ++ts->y_pred1_wgt8;
+ else
+ ++ts->y_pred1_wle8;
+
+ if (nPbH > 16)
+ ++ts->y_pred1_hgt16;
+ else
+ ++ts->y_pred1_hle16;
+ }
+#endif
+ src1->x = x1_m3 + start_x;
+ src1->y = y1_m3;
+ src1->base = src_vc_address_y;
+ if (bw <= 8)
+ {
+ src2->x = MC_DUMMY_X;
+ src2->y = MC_DUMMY_Y;
+#if RPI_QPU_EMU_Y
+ src2->base = s->qpu_dummy_frame_emu;
+#else
+ src2->base = s->qpu_dummy_frame_qpu;
+#endif
+ }
+ else
+ {
+ src2->x = x1_m3 + start_x + 8;
+ src2->y = y1_m3;
+ src2->base = src_vc_address_y;
+ }
+ cmd_y->w = bw;
+ cmd_y->h = bh;
+ cmd_y->mymx21 = my2_mx2_my_mx;
+ cmd_y->wo1 = wo;
+ cmd_y->wo2 = wo;
+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+
+ if (bw == 8) {
+ s->last_y8_l1 = src2;
+ s->last_y8_p = cmd_y;
+ }
+ }
+ }
+}
+
+static void
+rpi_pred_y_b(HEVCContext * const s,
+ const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const struct MvField *const mv_field,
+ AVFrame *const src_frame,
+ AVFrame *const src_frame2)
+{
+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+ const Mv * const mv = mv_field->mv + 0;
+ const Mv * const mv2 = mv_field->mv + 1;
+
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my<<8) | mx;
+ const unsigned int mx2 = mv2->x & 3;
+ const unsigned int my2 = mv2->y & 3;
+ const unsigned int my2_mx2 = (my2<<8) | mx2;
+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
+ const unsigned int ref_idx0 = mv_field->ref_idx[0];
+ const unsigned int ref_idx1 = mv_field->ref_idx[1];
+ const uint32_t wt_offset =
+ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1;
+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
+
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip;
+
+ if (my2_mx2_my_mx == 0)
+ {
+ const int x1 = x0 + (mv->x >> 2);
+ const int y1 = y0 + (mv->y >> 2);
+ const int x2 = x0 + (mv2->x >> 2);
+ const int y2 = y0 + (mv2->y >> 2);
+ const int bh = nPbH;
+
+ // Can do chunks a full 16 wide if we don't want the H filter
+ for (int start_x=0; start_x < nPbW; start_x += 16)
+ {
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ ++ts->y_pred2_x0y0;
+
+ if (nPbH > 16)
+ ++ts->y_pred2_hgt16;
+ else
+ ++ts->y_pred2_hle16;
+ }
+#endif
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src1_base;
+ src2->x = x2 + start_x;
+ src2->y = y2;
+ src2->base = src2_base;
+ cmd_y->w = FFMIN(nPbW - start_x, 16);
+ cmd_y->h = bh;
+ cmd_y->mymx21 = 0;
+ cmd_y->wo1 = wo1;
+ cmd_y->wo2 = wo2;
+ cmd_y->dst_addr = dst + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+ else
+ {
+ // Filter requires a run-up of 3
+ const int x1 = x0 + (mv->x >> 2) - 3;
+ const int y1 = y0 + (mv->y >> 2) - 3;
+ const int x2 = x0 + (mv2->x >> 2) - 3;
+ const int y2 = y0 + (mv2->y >> 2) - 3;
+ const int bh = nPbH;
+
+ for (int start_x=0; start_x < nPbW; start_x += 8)
+ { // B blocks work 8 at a time
+ // B weights aren't doubled as the QPU code does the same
+ // amount of work as it does for P
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+ const unsigned int mmx = mx | mx2;
+ const unsigned int mmy = my | my2;
+ if (mmx == 0 && mmy == 0)
+ ++ts->y_pred2_x0y0;
+ else if (mmx == 0)
+ ++ts->y_pred2_x0;
+ else if (mmy == 0)
+ ++ts->y_pred2_y0;
+ else
+ ++ts->y_pred2_xy;
+
+ if (nPbH > 16)
+ ++ts->y_pred2_hgt16;
+ else
+ ++ts->y_pred2_hle16;
+ }
+#endif
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src1_base;
+ src2->x = x2 + start_x;
+ src2->y = y2;
+ src2->base = src2_base;
+ cmd_y->w = FFMIN(nPbW - start_x, 8);
+ cmd_y->h = bh;
+ cmd_y->mymx21 = my2_mx2_my_mx;
+ cmd_y->wo1 = wo1;
+ cmd_y->wo2 = wo2;
+ cmd_y->dst_addr = dst + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+}
+
+// h/v shifts fixed at one as that is all the qasm copes with
+static void
+rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c,
+ const int nPbW_c, const int nPbH_c,
+ const Mv * const mv,
+ const int16_t * const c_weights,
+ const int16_t * const c_offsets,
+ AVFrame * const src_frame)
+{
+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
+ const int hshift = 1; // = s->ps.sps->hshift[1];
+ const int vshift = 1; // = s->ps.sps->vshift[1];
+
+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1;
+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
+ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]);
+ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]);
+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
+ const unsigned int bh = nPbH_c;
+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
+
+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
+ {
+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
+ qpu_mc_src_t * const last_lx = *plast_lx;
+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+
+ last_lx->x = x1_c + start_x;
+ last_lx->y = y1_c;
+ last_lx->base = src_base_u;
+ cmd_c->h = bh;
+ cmd_c->w = bw;
+ cmd_c->coeffs_x = x_coeffs;
+ cmd_c->coeffs_y = y_coeffs;
+ cmd_c->wo_u = wo_u;
+ cmd_c->wo_v = wo_v;
+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
+ *plast_lx = &cmd_c->next_src;
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
+ }
+ return;
+}
+
+// h/v shifts fixed at one as that is all the qasm copes with
+static void
+rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c,
+ const int nPbW_c, const int nPbH_c,
+ const struct MvField * const mv_field,
+ const int16_t * const c_weights,
+ const int16_t * const c_offsets,
+ const int16_t * const c_weights2,
+ const int16_t * const c_offsets2,
+ AVFrame * const src_frame,
+ AVFrame * const src_frame2)
+{
+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
+ const int hshift = 1; // s->ps.sps->hshift[1];
+ const int vshift = 1; // s->ps.sps->vshift[1];
+ const Mv * const mv = mv_field->mv + 0;
+ const Mv * const mv2 = mv_field->mv + 1;
+
+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1;
+
+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
+
+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1;
+
+ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]);
+ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]);
+
+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
+ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
+ const unsigned int bh = nPbH_c;
+
+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
+ {
+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+
+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
+ qpu_mc_src_t * const src_l0 = cp->last_l0;
+ qpu_mc_src_t * const src_l1 = cp->last_l1;
+
+ src_l0->x = x1_c + start_x;
+ src_l0->y = y1_c;
+ src_l0->base = src1_base;
+ src_l1->x = x2_c + start_x;
+ src_l1->y = y2_c;
+ src_l1->base = src2_base;
+
+ u[0].h = bh;
+ u[0].w = bw;
+ u[0].coeffs_x1 = coefs0_x;
+ u[0].coeffs_y1 = coefs0_y;
+ u[0].weight_u1 = c_weights[0]; // Weight L0 U
+ u[0].weight_v1 = c_weights[1]; // Weight L0 V
+ u[0].coeffs_x2 = coefs1_x;
+ u[0].coeffs_y2 = coefs1_y;
+ u[0].wo_u2 = wo_u2;
+ u[0].wo_v2 = wo_v2;
+ u[0].dst_addr_c = dst_base_u + (start_x << xshl);
+
+ cp->last_l0 = &u[0].next_src1;
+ cp->last_l1 = &u[0].next_src2;
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
+ }
+}
+
+
+#endif
+
+
+
+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
{
#define POS(c_idx, x, y) \
&s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
(((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
- HEVCLocalContext *lc = s->HEVClc;
+ HEVCLocalContext * const lc = s->HEVClc;
int merge_idx = 0;
struct MvField current_mv = {{{ 0 }}};
@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int y_cb = y0 >> log2_min_cb_size;
int x_pu, y_pu;
int i, j;
-
- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
if (!skip_flag)
lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+ ref0->frame);
+ } else
+#endif
+ {
+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l0[current_mv.ref_idx[0]],
s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ }
if (s->ps.sps->chroma_format_idc) {
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+ ref0->frame);
+ return;
+ }
+#endif
chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+ ref1->frame);
+ } else
+#endif
+ {
+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l1[current_mv.ref_idx[1]],
s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ }
if (s->ps.sps->chroma_format_idc) {
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+ ref1->frame);
+ return;
+ }
+#endif
chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
+ } else
+#endif
+ {
+ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
ref1->frame, &current_mv.mv[1], &current_mv);
+ }
if (s->ps.sps->chroma_format_idc) {
+#if RPI_INTER
+ if (s->enable_rpi) {
+ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
+ &current_mv,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+ ref0->frame,
+ ref1->frame);
+ return;
+ }
+#endif
chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
if (s->ps.sps->pcm.loop_filter_disable_flag)
+ {
set_deblocking_bypass(s, x0, y0, log2_cb_size);
+ }
if (ret < 0)
return ret;
@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
}
+#ifdef RPI
+static void rpi_execute_dblk_cmds(HEVCContext *s)
+{
+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ HEVCRpiDeblkEnv *const de = &s->jb1->deblk;
+ unsigned int i;
+
+ for (i = 0; i != de->n; ++i)
+ {
+ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size);
+ }
+ de->n = 0;
+}
+
+#if 0
+static void rpi_execute_transform(HEVCContext *s)
+{
+ int i=2;
+ int job = s->pass1_job;
+ /*int j;
+ int16_t *coeffs = s->coeffs_buf_arm[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+ s->hevcdsp.idct[4-2](coeffs, 16);
+ }
+ i=3;
+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+ s->hevcdsp.idct[5-2](coeffs, 32);
+ }*/
+
+ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+ //gpu_cache_flush(&s->coeffs_buf_accelerated);
+ //vpu_wait(s->vpu_id);
+
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+}
+#endif
+
+
+#define RPI_OPT_SEP_PRED 0
+
+
+// I-pred and transform_and_add for all block types are done here
+// All ARM
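+// The commands in the intra queue are built during the bitstream decode pass
+// (pass 0); this replays them on the worker thread in pass 1, after the VPU
+// transforms they depend on have completed (see worker_core).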
+#if RPI_OPT_SEP_PRED
+static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma)
+#else
+static void rpi_execute_pred_cmds(HEVCContext * const s)
+#endif
+{
+ int i;
+ HEVCRpiIntraPredEnv * iap = &s->jb1->intra;
+ const HEVCPredCmd *cmd = iap->cmds;
+#ifdef RPI
+ HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+#else
+ HEVCLocalContext *lc = s->HEVClc;
+#endif
+
+ for(i = iap->n; i > 0; i--, cmd++) {
+// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+#if RPI_OPT_SEP_PRED
+ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) {
+ continue;
+ }
+#endif
+
+ switch (cmd->type)
+ {
+ case RPI_PRED_INTRA:
+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
+ lc->na.cand_left = (cmd->na >> 3) & 1;
+ lc->na.cand_up_left = (cmd->na >> 2) & 1;
+ lc->na.cand_up = (cmd->na >> 1) & 1;
+ lc->na.cand_up_right = (cmd->na >> 0) & 1;
+ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0)
+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+ else
+ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+ break;
+
+ case RPI_PRED_ADD_RESIDUAL:
+ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
+ case RPI_PRED_ADD_DC:
+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
+ break;
+#if RPI_HEVC_SAND
+ case RPI_PRED_ADD_RESIDUAL_U:
+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_V:
+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_C:
+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
+ case RPI_PRED_ADD_DC_U:
+ case RPI_PRED_ADD_DC_V:
+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
+ break;
+#endif
+
+ case RPI_PRED_I_PCM:
+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
+ break;
+
+ default:
+ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
+ abort();
+ }
+ }
+#if RPI_OPT_SEP_PRED
+ if (do_luma)
+#endif
+ {
+ iap->n = 0;
+ }
+}
+
+
+#endif
+
+#ifdef RPI
+
+// Set initial uniform job values & zero ctu_count
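+// Each QPU queue is primed with a "setup" uniform block: picture dimensions,
+// sand strides, the weight denominator and zeroed src/next links. The per-block
+// commands built by the rpi_pred_*() calls are then appended after it.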
+static void rpi_begin(HEVCContext *s)
+{
+#if RPI_INTER
+ unsigned int i;
+ HEVCRpiJob * const jb = s->jb0;
+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
+
+ const uint16_t pic_width_y = s->ps.sps->width;
+ const uint16_t pic_height_y = s->ps.sps->height;
+
+ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1];
+ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1];
+
+ rpi_inter_pred_reset(cipe);
+ for (i = 0; i < cipe->n; i++) {
+ HEVCRpiInterPredQ * const cp = cipe->q + i;
+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
+
+ u->next_src1.x = 0;
+ u->next_src1.y = 0;
+ u->next_src1.base = 0;
+ u->pic_cw = pic_width_c;
+ u->pic_ch = pic_height_c;
+ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
+ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
+ u->wdenom = s->sh.chroma_log2_weight_denom;
+ cp->last_l0 = &u->next_src1;
+
+ u->next_fn = 0;
+ u->next_src2.x = 0;
+ u->next_src2.y = 0;
+ u->next_src2.base = 0;
+ cp->last_l1 = &u->next_src2;
+
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
+ }
+
+ rpi_inter_pred_reset(yipe);
+ for (i = 0; i < yipe->n; i++) {
+ HEVCRpiInterPredQ * const yp = yipe->q + i;
+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
+
+ y->next_src1.x = 0;
+ y->next_src1.y = 0;
+ y->next_src1.base = 0;
+ y->next_src2.x = 0;
+ y->next_src2.y = 0;
+ y->next_src2.base = 0;
+ y->pic_h = pic_height_y;
+ y->pic_w = pic_width_y;
+ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
+ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
+ y->wdenom = s->sh.luma_log2_weight_denom;
+ y->next_fn = 0;
+ yp->last_l0 = &y->next_src1;
+ yp->last_l1 = &y->next_src2;
+
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
+ }
+
+ s->last_y8_p = NULL;
+ s->last_y8_l1 = NULL;
+
+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) {
+ jb->progress[i] = -1;
+ }
+
+#endif
+ s->ctu_count = 0;
+}
+#endif
+
+
+#if RPI_INTER
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+static unsigned int mc_terminate_add_qpu(HEVCContext * const s,
+ const vpu_qpu_job_h vqj,
+ rpi_cache_flush_env_t * const rfe,
+ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
+ unsigned int max_block = 0;
+
+ if (!ipe->used) {
+ return 0;
+ }
+
+ if (ipe->curr != 0) {
+ rpi_inter_pred_sync(ipe);
+ }
+
+ // Add final commands to Q
+ for(i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const yp = ipe->q + i;
+ qpu_mc_src_t *const p0 = yp->last_l0;
+ qpu_mc_src_t *const p1 = yp->last_l1;
+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
+
+ if (block_size > max_block)
+ max_block = block_size;
+
+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
+
+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+ p0->x = MC_DUMMY_X;
+ p0->y = MC_DUMMY_Y;
+ p0->base = s->qpu_dummy_frame_qpu;
+ p1->x = MC_DUMMY_X;
+ p1->y = MC_DUMMY_Y;
+ p1->base = s->qpu_dummy_frame_qpu;
+
+ yp->last_l0 = NULL;
+ yp->last_l1 = NULL;
+
+ // Add to mailbox list
+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
+ mail[i][1] = yp->code_setup;
+ }
+
+#if RPI_CACHE_UNIF_MVS
+    // We don't need an invalidate here as the uniforms aren't changed by the QPU,
+    // and leaving them in the ARM cache avoids (pointless) pre-reads when writing
+    // new values, which seems to give us a small performance advantage
+    //
+    // In most cases we will not have a completely packed set of uniforms and, as
+    // we have a 2D invalidate, we write back all uniform Qs to the depth of the
+    // fullest
+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
+ ipe->n, ipe->max_fill + ipe->min_gap);
+#endif
+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
+
+ return 1;
+}
+#endif
+
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+static unsigned int mc_terminate_add_emu(HEVCContext * const s,
+ const vpu_qpu_job_h vqj,
+ rpi_cache_flush_env_t * const rfe,
+ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ if (!ipe->used) {
+ return 0;
+ }
+
+ if (ipe->curr != 0) {
+ rpi_inter_pred_sync(ipe);
+ }
+
+ // Add final commands to Q
+ for(i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const yp = ipe->q + i;
+ qpu_mc_src_t *const p0 = yp->last_l0;
+ qpu_mc_src_t *const p1 = yp->last_l1;
+
+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
+
+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+ p0->x = MC_DUMMY_X;
+ p0->y = MC_DUMMY_Y;
+ p0->base = s->qpu_dummy_frame_emu;
+ p1->x = MC_DUMMY_X;
+ p1->y = MC_DUMMY_Y;
+ p1->base = s->qpu_dummy_frame_emu;
+
+ yp->last_l0 = NULL;
+ yp->last_l1 = NULL;
+ }
+
+ return 1;
+}
+#endif
+
+
+#if RPI_QPU_EMU_Y
+#define mc_terminate_add_y mc_terminate_add_emu
+#else
+#define mc_terminate_add_y mc_terminate_add_qpu
+#endif
+#if RPI_QPU_EMU_C
+#define mc_terminate_add_c mc_terminate_add_emu
+#else
+#define mc_terminate_add_c mc_terminate_add_qpu
+#endif
+#endif
+
+#ifdef RPI
+
+
+static void flush_frame(HEVCContext *s,AVFrame *frame)
+{
+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ rpi_cache_flush_finish(rfe);
+}
+
+
+// Core execution tasks
+static void worker_core(HEVCContext * const s)
+{
+#if RPI_OPT_SEP_PRED
+ vpu_qpu_wait_h sync_c;
+#endif
+ vpu_qpu_wait_h sync_y;
+
+ HEVCRpiJob * const jb = s->jb1;
+ int pred_y, pred_c;
+
+ const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+
+ {
+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
+ if (cf->s[3].n + cf->s[2].n != 0)
+ {
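+        // Coefficient buffer layout (as filled in pass 0): s[2] holds the 16x16
+        // transform blocks (256 coeffs each) counted up from the start of gptr;
+        // s[3] holds the 32x32 blocks (1024 coeffs each) and appears to be filled
+        // downwards from the end, hence offset32 locating its lowest address.
+        // The >>8 / >>10 turn coefficient counts into block counts for the VPU.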
+ const unsigned int csize = sizeof(cf->s[3].buf[0]);
+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
+ vpu_qpu_job_add_vpu(vqj,
+ vpu_get_fn(s->ps.sps->bit_depth),
+ vpu_get_constants(),
+ cf->gptr.vc,
+ cf->s[2].n >> 8,
+ cf->gptr.vc + offset32,
+ cf->s[3].n >> 10,
+ 0);
+
+ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
+ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
+ }
+ }
+
+ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip);
+
+// We can take a sync here and try to locally overlap QPU processing with ARM
+// but testing showed a slightly negative benefit with noticeable extra complexity
+#if RPI_OPT_SEP_PRED
+ vpu_qpu_job_add_sync_this(vqj, &sync_c);
+#endif
+
+ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip);
+
+ vpu_qpu_job_add_sync_this(vqj, &sync_y);
+
+
+ // We are expecting a contiguous Z-shaped set of blocks
+ // So generate up to 3 blocks:
+ // 1st line
+ // body
+ // last line
+ // This will work even if we don't have the expected geometry
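+    // blks_tlbr[] accumulates up to 3 top/left/bottom/right bounding boxes
+    // (first partial row, body, last partial row) over the CTBs deblocked in
+    // this job so that we issue a handful of large cache invalidates below
+    // rather than one per CTB.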
+ if (pred_y || pred_c)
+ {
+ const HEVCRpiDeblkEnv *const de = &jb->deblk;
+ const HEVCRpiDeblkBlk * db = de->blks + 0;
+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ unsigned int x0 = db->x_ctb;
+ unsigned int xx = x0 + ctb_size;
+ unsigned int y0 = db->y_ctb;
+
+ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}};
+ unsigned int b = 0;
+ unsigned int i;
+
+ for (i = 1, ++db; i < de->n; ++i, ++db)
+ {
+ if (db->x_ctb == xx && db->y_ctb == y0) {
+ xx += ctb_size;
+ }
+ else
+ {
+ unsigned int * const tlbr = blks_tlbr[b];
+ if (tlbr[0] > y0)
+ tlbr[0] = y0;
+ if (tlbr[1] > x0)
+ tlbr[1] = x0;
+ if (tlbr[2] < y0 + ctb_size)
+ tlbr[2] = y0 + ctb_size;
+ if (tlbr[3] < xx)
+ tlbr[3] = xx;
+ x0 = db->x_ctb;
+ xx = x0 + ctb_size;
+ y0 = db->y_ctb;
+ b = 1;
+ }
+ }
+
+ if (blks_tlbr[b][0] != ~0U)
+ ++b;
+
+ {
+ unsigned int * const tlbr = blks_tlbr[b];
+ tlbr[0] = y0;
+ tlbr[1] = x0;
+ tlbr[2] = y0 + ctb_size;
+ tlbr[3] = xx;
+ }
+
+ // ??? Coalesce blocks ???
+ for (i = 0; i <= b; ++i) {
+ const unsigned int * const tlbr = blks_tlbr[i];
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
+ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c);
+ }
+ }
+
+
+ // Having accumulated some commands - do them
+ rpi_cache_flush_finish(rfe);
+
+ // Await progress as required
+ {
+ unsigned int i;
+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) {
+ if (jb->progress[i] >= 0) {
+ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]);
+ }
+ }
+ }
+
+ vpu_qpu_job_finish(vqj);
+
+ worker_pic_reset(&jb->coeffs);
+
+ // If we have emulated VPU ops - do it here
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ if (av_rpi_is_sand8_frame(s->frame))
+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
+ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
+#elif RPI_QPU_EMU_Y
+ rpi_shader_c8(s, &jb->luma_ip, NULL);
+#else
+ rpi_shader_c8(s, NULL, &jb->chroma_ip);
+#endif
+ else
+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
+ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
+#elif RPI_QPU_EMU_Y
+ rpi_shader_c16(s, &jb->luma_ip, NULL);
+#else
+ rpi_shader_c16(s, NULL, &jb->chroma_ip);
+#endif
+#endif
+
+#if RPI_OPT_SEP_PRED
+    // Wait for the VPU transform & chroma inter-pred to complete
+ vpu_qpu_wait(&sync_c);
+
+    // Perform chroma intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s, 0, 1);
+
+    // Wait for the luma inter-pred to complete
+ vpu_qpu_wait(&sync_y);
+
+    // Perform luma intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s, 1, 0);
+#else
+ // Wait for transform completion
+ vpu_qpu_wait(&sync_y);
+
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+#endif
+
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+}
+
+static void rpi_do_all_passes(HEVCContext *s)
+{
+ // Called from main thread - must be no pending background jobs
+ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending);
+
+ // Do the various passes - common with the worker code
+ worker_core(s);
+ // Prepare next batch
+ rpi_begin(s);
+}
+
+
+#endif
+
static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
{
HEVCContext *s = avctxt->priv_data;
@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
int y_ctb = 0;
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+#ifdef RPI
+    // * We don't support cross_component_prediction_enabled_flag, but as that
+    //   must be 0 unless we have 4:4:4, there is no point testing for it: we
+    //   only deal with sand, which is never 4:4:4
+ // [support wouldn't be hard]
+ s->enable_rpi =
+ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) ||
+ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10));
+#endif
+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L0],s->sh.nb_refs[L1]);
+
if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
return AVERROR_INVALIDDATA;
@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
}
}
+#ifdef RPI
+ // Worker must be idle at start
+ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending);
+ rpi_begin(s);
+#endif
+
while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+#ifdef RPI
+ // Report progress so we can use our MVs in other frames
+ // If we are tiled then this isn't really optimal but given that tiling
+ // can change on a per pic basis (described in PPS) other schemes are
+ // quite a lot harder
+ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) {
+ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1);
+ }
+
+ if (s->enable_rpi) {
+ int q_full = (++s->ctu_count >= s->max_ctu_count);
+
+ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0)
+ q_full = 1;
+ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0)
+ q_full = 1;
+
+ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb;
+ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb;
+
+ if (q_full) {
+ if (s->used_for_ref)
+ {
+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
+
+// worker_wait(s);
+ // Split work load onto separate threads so we make as rapid progress as possible with this frame
+ // Pass on this job to worker thread
+ worker_submit_job(s);
+
+ // Make sure we have space to prepare the next job
+ worker_pass0_ready(s);
+
+ // Prepare the next batch of commands
+ rpi_begin(s);
+ } else {
+ // Non-ref frame so do it all on this thread
+ rpi_do_all_passes(s);
+ }
+ }
+
+ }
+#endif
+
+
if (more_data < 0) {
s->tab_slice_address[ctb_addr_rs] = -1;
return more_data;
@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
ctb_addr_ts++;
ff_hevc_save_states(s, ctb_addr_ts);
+#ifdef RPI
+ if (s->enable_rpi)
+ continue;
+#endif
ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
}
+#ifdef RPI
+
+ // Wait for the worker to finish all its jobs
+ if (s->enable_rpi) {
+ worker_wait(s);
+ }
+
+ // Finish off any half-completed rows
+ if (s->enable_rpi && s->ctu_count) {
+ rpi_do_all_passes(s);
+ }
+
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+
+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
+ ts->y_pred2_hgt16, ts->y_pred2_hle16);
+ memset(ts, 0, sizeof(*ts));
+ }
+#endif
+
+#endif
+
if (x_ctb + ctb_size >= s->ps.sps->width &&
y_ctb + ctb_size >= s->ps.sps->height)
ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
s = s1->sList[self_id];
lc = s->HEVClc;
+#ifdef RPI
+ s->enable_rpi = 0;
+ //printf("Wavefront\n");
+#endif
+
if(ctb_row) {
ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
if (ret < 0)
return ret;
- if (s->max_ra == INT_MAX) {
- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
- s->max_ra = s->poc;
+ // The definition of _N unit types is "non-reference for other frames
+ // with the same temporal_id" so they may/will be ref frames for pics
+ // with a higher temporal_id.
+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
+ !(s->nal_unit_type == NAL_TRAIL_N ||
+ s->nal_unit_type == NAL_TSA_N ||
+ s->nal_unit_type == NAL_STSA_N ||
+ s->nal_unit_type == NAL_RADL_N ||
+ s->nal_unit_type == NAL_RASL_N);
+
+#if DEBUG_DECODE_N
+ {
+ static int z = 0;
+ if (IS_IDR(s)) {
+ z = 1;
+ }
+ if (z != 0 && z++ > DEBUG_DECODE_N) {
+ s->is_decoded = 0;
+ break;
+ }
+ }
+#endif
+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+ s->is_decoded = 0;
+ break;
+ }
+
+ if (s->sh.first_slice_in_pic_flag) {
+ if (s->max_ra == INT_MAX) {
+ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+ s->max_ra = s->poc;
+ } else {
+ if (IS_IDR(s))
+ s->max_ra = INT_MIN;
+ }
+ }
+
+ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) &&
+ s->poc <= s->max_ra) {
+ s->is_decoded = 0;
+ break;
} else {
if (IS_IDR(s))
s->max_ra = INT_MIN;
@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
}
}
-fail:
- if (s->ref && s->threads_type == FF_THREAD_FRAME)
- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-
+fail: // Also success path
+ if (s->ref != NULL) {
+ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) {
+#ifdef RPI
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height);
+#endif
+ ff_hevc_progress_signal_all_done(s);
+ }
+#ifdef RPI
+ // * Flush frame will become confused if we pass it something
+ // that doesn't have an expected number of planes (e.g. 400)
+ // So only flush if we are sure we can.
+ else if (s->enable_rpi) {
+ // Flush frame to real memory as we expect to be able to pass
+ // it straight on to mmal
+ flush_frame(s, s->frame);
+ }
+#endif
+ }
return ret;
}
@@ -3070,6 +5056,83 @@ fail:
return AVERROR(ENOMEM);
}
+#ifdef RPI
+static av_cold void hevc_init_worker(HEVCContext * const s)
+{
+ int err;
+
+ memset(s->jobs, 0, sizeof(s->jobs));
+
+ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) {
+ HEVCRpiJob * const jb = s->jobs + job;
+
+ sem_init(&jb->sem_in, 0, 0);
+ sem_init(&jb->sem_out, 0, 0);
+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
+
+ jb->intra.n = 0;
+ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS);
+
+ // ** Sizeof the union structure might be overkill but at the moment it
+ // is correct (it certainly isn't going to be too small)
+
+ rpi_inter_pred_alloc(&jb->chroma_ip,
+ QPU_N_MAX, QPU_N_GRP,
+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t),
+ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t));
+ rpi_inter_pred_alloc(&jb->luma_ip,
+ QPU_N_MAX, QPU_N_GRP,
+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t),
+ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t));
+
+ jb->deblk.n = 0;
+ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS);
+ }
+ s->pass0_job = 0;
+ s->pass1_job = 0;
+ s->jb0 = s->jobs + 0;
+ s->jb1 = s->jobs + 0;
+
+ err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+ if (err) {
+ printf("Failed to create worker thread\n");
+ exit(-1);
+ }
+}
+
+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
+{
+ av_freep(&ipe->q);
+ gpu_free(&ipe->gptr);
+}
+
+static av_cold void hevc_exit_worker(HEVCContext *s)
+{
+ void *res;
+ unsigned int i;
+
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ s->jobs[i].terminate = 1;
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ sem_post(&s->jobs[i].sem_in);
+ pthread_join(s->worker_thread, &res);
+
+ for(i = 0; i < RPI_MAX_JOBS; i++)
+ {
+ HEVCRpiJob * const jb = s->jobs + i;
+
+ sem_destroy(&jb->sem_in);
+ sem_destroy(&jb->sem_out);
+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
+ av_freep(&jb->intra.cmds);
+ av_freep(&jb->deblk.blks);
+ rpi_free_inter_pred(&jb->chroma_ip);
+ rpi_free_inter_pred(&jb->luma_ip);
+ }
+}
+
+#endif
+
static av_cold int hevc_decode_free(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
av_freep(&s->cabac_state);
- for (i = 0; i < 3; i++) {
- av_freep(&s->sao_pixel_buffer_h[i]);
- av_freep(&s->sao_pixel_buffer_v[i]);
+#ifdef RPI
+
+ hevc_exit_worker(s);
+ vpu_qpu_term();
+ for (i = 0; i != 2; ++i) {
+ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
}
+
+ av_rpi_zc_uninit(avctx);
+#endif
+
+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
+ av_freep(&s->sao_pixel_buffer_v[0]);
av_frame_free(&s->output_frame);
for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
return 0;
}
+
static av_cold int hevc_init_context(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
s->HEVClcList[0] = s->HEVClc;
s->sList[0] = s;
+#ifdef RPI
+    // Whilst FFmpeg's init fn is only called once, the close fn is called as
+    // many times as we have threads (init_thread_copy is called for the
+    // threads). So to match init & term, put the init here where it will be
+    // called by both init & copy.
+ av_rpi_zc_init(avctx);
+
+ if (vpu_qpu_init() != 0)
+ goto fail;
+
+#if RPI_INTER
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ {
+ static const uint32_t dframe[1] = {0x80808080};
+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
+ }
+#endif
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame
+#endif
+#endif
+ //gpu_malloc_uncached(2048*64,&s->dummy);
+
+ s->enable_rpi = 0;
+
+ for (i = 0; i != 2; ++i) {
+ ff_hevc_rpi_progress_init_state(s->progress_states + i);
+ }
+ hevc_init_worker(s);
+#endif
+
s->cabac_state = av_malloc(HEVC_CONTEXTS);
if (!s->cabac_state)
goto fail;
@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
if (!s->DPB[i].frame)
goto fail;
s->DPB[i].tf.f = s->DPB[i].frame;
+ s->DPB[i].dpb_no = i;
}
s->max_ra = INT_MAX;
@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
}
if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
- s->threads_type = FF_THREAD_FRAME;
- else
- s->threads_type = FF_THREAD_SLICE;
+ s->threads_type = FF_THREAD_FRAME;
+ else
+ s->threads_type = FF_THREAD_SLICE;
return 0;
}
@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = {
.update_thread_context = hevc_update_thread_context,
.init_thread_copy = hevc_init_thread_copy,
.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+// 0,
+// AV_CODEC_CAP_FRAME_THREADS,
AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
.profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
};
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 162ca0e582..d647232638 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -23,6 +23,7 @@
#ifndef AVCODEC_HEVC_H
#define AVCODEC_HEVC_H
+#include "rpi_opts.h"
#include "libavutil/buffer.h"
#include "libavutil/md5.h"
@@ -37,6 +38,10 @@
#include "thread.h"
#include "videodsp.h"
+#ifdef RPI
+#include "rpi_qpu.h"
+#endif
+
#define MAX_DPB_SIZE 16 // A.4.1
#define MAX_REFS 16
@@ -463,6 +468,7 @@ typedef struct HEVCSPS {
int implicit_rdpcm_enabled_flag;
int explicit_rdpcm_enabled_flag;
int intra_smoothing_disabled_flag;
+ int high_precision_offsets_enabled_flag;
int persistent_rice_adaptation_enabled_flag;
///< coded frame dimension in various units
@@ -660,6 +666,7 @@ typedef struct CodingUnit {
uint8_t cu_transquant_bypass_flag;
} CodingUnit;
+#if 0
typedef struct Mv {
int16_t x; ///< horizontal component of motion vector
int16_t y; ///< vertical component of motion vector
@@ -670,6 +677,7 @@ typedef struct MvField {
int8_t ref_idx[2];
int8_t pred_flag;
} MvField;
+#endif
typedef struct NeighbourAvailable {
int cand_bottom_left;
@@ -745,9 +753,23 @@ typedef struct HEVCFrame {
* A combination of HEVC_FRAME_FLAG_*
*/
uint8_t flags;
+
+ // Entry no in DPB - can be used as a small unique
+ // frame identifier (within the current thread)
+ uint8_t dpb_no;
} HEVCFrame;
+#ifdef RPI
+typedef struct HEVCLocalContextIntra {
+ TransformUnit tu;
+ NeighbourAvailable na;
+} HEVCLocalContextIntra;
+#endif
+
typedef struct HEVCLocalContext {
+ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!)
+ NeighbourAvailable na;
+
uint8_t cabac_state[HEVC_CONTEXTS];
uint8_t stat_coeff[4];
@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext {
int qPy_pred;
- TransformUnit tu;
-
uint8_t ctb_left_flag;
uint8_t ctb_up_flag;
uint8_t ctb_up_right_flag;
@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext {
int ct_depth;
CodingUnit cu;
PredictionUnit pu;
- NeighbourAvailable na;
#define BOUNDARY_LEFT_SLICE (1 << 0)
#define BOUNDARY_LEFT_TILE (1 << 1)
@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext {
int boundary_flags;
} HEVCLocalContext;
+#ifdef RPI
+
+// The processing is done in chunks
+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+// but allocates more memory and increases the latency before data in the next frame can be processed
+#define RPI_NUM_CHUNKS 4
+#define RPI_CHUNK_SIZE 12
+#define RPI_ROUND_TO_LINES 0
+
+// RPI_MAX_WIDTH is the maximum width in pixels supported by the accelerated code
+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE)
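+// With the values above this works out at 4 * 64 * 12 = 3072 pixels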
+
+// Worst case is 4:4:4 with 4x4 blocks in 64-pixel-high coding tree blocks, i.e. 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4))
+#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4))
+// Each block can have an intra prediction and a transform_add command
+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Worst case is 16x16 CTUs
+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
+
+#define RPI_CMD_LUMA_UNI 0
+#define RPI_CMD_CHROMA_UNI 1
+#define RPI_CMD_LUMA_BI 2
+#define RPI_CMD_CHROMA_BI 3
+#define RPI_CMD_V_BI 4
+
+// Command for inter prediction
+typedef struct HEVCMvCmd {
+ uint8_t cmd;
+ uint8_t block_w;
+ uint8_t block_h;
+ int8_t ref_idx[2];
+ uint16_t dststride;
+ uint16_t srcstride;
+ uint16_t srcstride1;
+ int16_t weight;
+ int16_t offset;
+ int16_t x_off;
+ int16_t y_off;
+ uint8_t *src;
+ uint8_t *src1;
+ uint8_t *dst;
+ Mv mv;
+ Mv mv1;
+} HEVCMvCmd;
+
+
+// Command for intra prediction and transform_add of predictions to coefficients
+enum rpi_pred_cmd_e
+{
+ RPI_PRED_ADD_RESIDUAL,
+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx
+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx
+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
+ RPI_PRED_ADD_DC,
+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
+ RPI_PRED_ADD_DC_V,
+ RPI_PRED_INTRA,
+ RPI_PRED_I_PCM,
+ RPI_PRED_CMD_MAX
+};
+
+typedef struct HEVCPredCmd {
+ uint8_t type;
+ uint8_t size; // log2 "size" used by all variants
+ uint8_t na; // i_pred - but left here as they pack well
+ uint8_t c_idx; // i_pred
+ union {
+ struct { // TRANSFORM_ADD
+ uint8_t * dst;
+ const int16_t * buf;
+ uint16_t stride; // Should be good enough for all pic fmts we use
+ int16_t dc;
+ } ta;
+ struct {
+ uint8_t * dst;
+ uint32_t stride;
+ int dc;
+ } dc;
+ struct { // INTRA
+ uint16_t x;
+ uint16_t y;
+ enum IntraPredMode mode;
+ } i_pred;
+ struct { // I_PCM
+ uint16_t x;
+ uint16_t y;
+ const void * src;
+ uint32_t src_len;
+ } i_pcm;
+ };
+} HEVCPredCmd;
+
+#endif
+
+#ifdef RPI
+#include <semaphore.h>
+
+union qpu_mc_pred_cmd_s;
+struct qpu_mc_pred_y_p_s;
+struct qpu_mc_src_s;
+
+typedef struct HEVCRpiInterPredQ
+{
+ union qpu_mc_pred_cmd_u *qpu_mc_base;
+ union qpu_mc_pred_cmd_u *qpu_mc_curr;
+ struct qpu_mc_src_s *last_l0;
+ struct qpu_mc_src_s *last_l1;
+ unsigned int load;
+ uint32_t code_setup;
+ uint32_t code_sync;
+ uint32_t code_exit;
+} HEVCRpiInterPredQ;
+
+typedef struct HEVCRpiInterPredEnv
+{
+ HEVCRpiInterPredQ * q;
+ unsigned int n; // Number of Qs
+ unsigned int n_grp; // Number of Q in a group
+ unsigned int curr; // Current Q number (0..n-1)
+ int used; // 0 if nothing in any Q, 1 otherwise
+ int used_grp; // 0 if nothing in any Q in the current group
+ unsigned int max_fill;
+ unsigned int min_gap;
+ GPU_MEM_PTR_T gptr;
+} HEVCRpiInterPredEnv;
+
+typedef struct HEVCRpiIntraPredEnv {
+ unsigned int n; // Number of commands
+ HEVCPredCmd * cmds;
+} HEVCRpiIntraPredEnv;
+
+typedef struct HEVCRpiCoeffEnv {
+ unsigned int n;
+ uint16_t * buf;
+} HEVCRpiCoeffEnv;
+
+typedef struct HEVCRpiCoeffsEnv {
+ HEVCRpiCoeffEnv s[4];
+ GPU_MEM_PTR_T gptr;
+ void * mptr;
+} HEVCRpiCoeffsEnv;
+
+typedef struct HEVCRpiDeblkBlk {
+ uint16_t x_ctb;
+ uint16_t y_ctb;
+} HEVCRpiDeblkBlk;
+
+typedef struct HEVCRpiDeblkEnv {
+ unsigned int n;
+ HEVCRpiDeblkBlk * blks;
+} HEVCRpiDeblkEnv;
+
+typedef struct HEVCRPiFrameProgressWait {
+ int req;
+ struct HEVCRPiFrameProgressWait * next;
+ sem_t sem;
+} HEVCRPiFrameProgressWait;
+
+typedef struct HEVCRPiFrameProgressState {
+ struct HEVCRPiFrameProgressWait * first;
+ struct HEVCRPiFrameProgressWait * last;
+ pthread_mutex_t lock;
+} HEVCRPiFrameProgressState;
+
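+// A job is filled by the main thread in pass 0 (QPU inter-pred command lists,
+// intra/residual commands, coefficients, deblock block list) and then handed to
+// the worker thread, which performs the pass 1 pixel processing on it;
+// sem_in/sem_out below provide the hand-over in each direction.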
+typedef struct HEVCRpiJob {
+ volatile int terminate;
+ int pending;
+ sem_t sem_in; // set by main
+ sem_t sem_out; // set by worker
+ HEVCRpiInterPredEnv chroma_ip;
+ HEVCRpiInterPredEnv luma_ip;
+ int16_t progress[32]; // index by dpb_no
+ HEVCRpiIntraPredEnv intra;
+ HEVCRpiCoeffsEnv coeffs;
+ HEVCRpiDeblkEnv deblk;
+ HEVCRPiFrameProgressWait progress_wait;
+} HEVCRpiJob;
+
+#if RPI_TSTATS
+typedef struct HEVCRpiStats {
+ int y_pred1_y8_merge;
+ int y_pred1_xy;
+ int y_pred1_x0;
+ int y_pred1_y0;
+ int y_pred1_x0y0;
+ int y_pred1_wle8;
+ int y_pred1_wgt8;
+ int y_pred1_hle16;
+ int y_pred1_hgt16;
+ int y_pred2_xy;
+ int y_pred2_x0;
+ int y_pred2_y0;
+ int y_pred2_x0y0;
+ int y_pred2_hle16;
+ int y_pred2_hgt16;
+} HEVCRpiStats;
+#endif
+
+#endif
+
typedef struct HEVCContext {
const AVClass *c; // needed by private avoptions
AVCodecContext *avctx;
@@ -805,6 +1025,69 @@ typedef struct HEVCContext {
int width;
int height;
+ int used_for_ref; // rpi
+#ifdef RPI
+ int enable_rpi;
+ unsigned int pass0_job; // Pass0 does coefficient decode
+ unsigned int pass1_job; // Pass1 does pixel processing
+ int ctu_count; // Number of CTUs done in pass0 so far
+ int max_ctu_count; // Number of CTUs when we trigger a round of processing
+
+ HEVCRpiJob * jb0;
+ HEVCRpiJob * jb1;
+ HEVCRpiJob jobs[RPI_MAX_JOBS];
+#if RPI_TSTATS
+ HEVCRpiStats tstats;
+#endif
+#if RPI_INTER
+ struct qpu_mc_pred_y_p_s * last_y8_p;
+ struct qpu_mc_src_s * last_y8_l1;
+
+ // Function pointers
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ const uint8_t * qpu_dummy_frame_emu;
+#endif
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
+#endif
+ HEVCRpiQpu qpu;
+#endif
+
+ pthread_t worker_thread;
+
+#ifdef RPI_DEBLOCK_VPU
+#define RPI_DEBLOCK_VPU_Q_COUNT 2
+ int enable_rpi_deblock;
+
+ int uv_setup_width;
+ int uv_setup_height;
+ int setup_width; // Number of 16x16 blocks across the image
+ int setup_height; // Number of 16x16 blocks down the image
+
+ struct dblk_vpu_q_s
+ {
+ GPU_MEM_PTR_T deblock_vpu_gmem;
+
+ uint8_t (*y_setup_arm)[2][2][2][4];
+ uint8_t (*y_setup_vc)[2][2][2][4];
+
+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
+ uint8_t (*uv_setup_vc)[2][2][2][4];
+
+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+ int vpu_cmds_vc;
+
+ vpu_qpu_wait_h cmd_id;
+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+
+ struct dblk_vpu_q_s * dvq;
+ unsigned int dvq_n;
+
+#endif
+ HEVCLocalContextIntra HEVClcIntra;
+ HEVCRPiFrameProgressState progress_states[2];
+#endif
+
uint8_t *cabac_state;
/** 1 if the independent slice segment header was successfully parsed */
@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id,
uint8_t *buf, int buf_size);
+#if RPI_INTER
+extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n);
+#endif
+
/**
* Reset SEI values that are stored on the Context.
@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16];
extern const uint8_t ff_hevc_diag_scan8x8_x[64];
extern const uint8_t ff_hevc_diag_scan8x8_y[64];
+#ifdef RPI
+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
+
+// arm/hevc_misc_neon.S
+// Neon coeff zap fn
+#if HAVE_NEON
+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
+#endif
+
+void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int val, const int field);
+
+void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field);
+
+// All of these expect that s->threads_type == FF_THREAD_FRAME
+
+static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int y)
+{
+ if (s->enable_rpi)
+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
+ else
+ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0);
+}
+
+static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y)
+{
+ if (s->enable_rpi && s->used_for_ref)
+ ff_hevc_rpi_progress_signal_field(s, y, 1);
+}
+
+static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int y)
+{
+ if (s->enable_rpi)
+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
+ else
+ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0);
+}
+
+static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y)
+{
+ if (s->used_for_ref)
+ {
+ if (s->enable_rpi)
+ ff_hevc_rpi_progress_signal_field(s, y, 0);
+ else
+ ff_thread_report_progress(&s->ref->tf, y, 0);
+ }
+}
+
+static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s)
+{
+ if (s->enable_rpi)
+ {
+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
+ }
+ else
+ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+}
+
+#else
+
+// Use #define as that allows us to discard "jb" which won't exist in non-RPI world
+#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0)
+#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0)
+#define ff_hevc_progress_signal_mv(s, y)
+#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0)
+#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0)
+
+#endif
+
+// Set all done - signal nothing (used in missing refs)
+// Works for both rpi & non-rpi
+static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref)
+{
+ if (ref->tf.progress != NULL)
+ {
+ int * const p = (int *)&ref->tf.progress->data;
+ p[0] = INT_MAX;
+ p[1] = INT_MAX;
+ }
+}
+
#endif /* AVCODEC_HEVC_H */
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 05b2821840..c84886817d 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -21,14 +21,76 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define UNCHECKED_BITSTREAM_READER 1
+
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "cabac_functions.h"
#include "hevc.h"
+#ifdef RPI
+#include "libavutil/rpi_sand_fns.h"
+#endif
+
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV ARCH_X86
+
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
+
+#if ARCH_ARM
+#include "arm/hevc_cabac.h"
+#endif
+
#define CABAC_MAX_BIN 31
+
+#if USE_BY22 && !USE_BY22_DIV
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+
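+// Table of ~(2^40 / range) for range = 256..511, indexed by (range & 0xff), so
+// that a 32x32->64 bit multiply can approximate low/range in the peek code
+// below. Entry 0 (range == 256) is left as 0, presumably because the exact
+// value (2^32) doesn't fit in 32 bits; get_cabac_by22_peek special-cases it.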
+static const uint32_t cabac_by22_inv_range[256] = {
+ 0, I(257), I(258), I(259),
+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+ I(510), I(511)
+};
+#undef I
+#endif // USE_BY22
+
/**
* number of bin by SyntaxElement.
*/
@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
{ 28, 36, 43, 49, 54, 58, 61, 63, },
};
+
+typedef struct
+{
+ uint16_t coeff;
+ uint16_t scale;
+} xy_off_t;
+
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
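+// Each entry pairs a coefficient offset within the transform block with the
+// corresponding offset into the (at most 8x8) scaling matrix; SCALE_TRAFO /
+// SCALE_SHR clamp larger transforms down to the 8x8 grid. There is one table
+// per scan order (diag/horiz/vert) and per log2 transform size 2..5.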
+
+#define OFF_DIAG(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+#define OFF_HORIZ(t) {\
+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+}
+
+#define OFF_VERT(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+static const xy_off_t off_xys[3][4][16] =
+{
+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+};
+
+
+// Helper fns
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+#define hevc_clz32 hevc_clz32_builtin
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+{
+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+}
+#endif
+
+// It is unlikely that we will ever need this but include for completeness
+#ifndef hevc_clz32
+static inline unsigned int hevc_clz32(unsigned int x)
+{
+ unsigned int n = 1;
+ if ((x & 0xffff0000) == 0) {
+ n += 16;
+ x <<= 16;
+ }
+ if ((x & 0xff000000) == 0) {
+ n += 8;
+ x <<= 8;
+ }
+ if ((x & 0xf0000000) == 0) {
+ n += 4;
+ x <<= 4;
+ }
+ if ((x & 0xc0000000) == 0) {
+ n += 2;
+ x <<= 2;
+ }
+ return n - ((x >> 31) & 1);
+}
+#endif
+
+
+#if !USE_BY22
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+// will no longer be called but the setup calls will still exist and we want
+// to null them out
+#define bypass_start(s)
+#define bypass_finish(s)
+#else
+// Use BY22 for residual bypass block
+
+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+
+// BY22 observes that bypass decoding is simply a divide into the bitstream, so
+// we can peek out large quantities of bits at once and treat the result as if
+// it were VLC. In many cases this leads to O(1) processing rather than O(n),
+// though the setup and teardown are sufficiently expensive that it is only
+// worth using if we expect to be dealing with more than a few bits.
+// The definition of "a few bits" will vary from platform to platform, but
+// tests on ARM show that it probably isn't worth it for a single coded
+// residual, though it is for more than one. It also seems likely that if there
+// are more residuals then they are likely to be bigger, which makes the O(1)
+// nature of the code more worthwhile.
+
+
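+// Typical use, as in the *_bypass decoders below:
+//     y = get_cabac_by22_peek(c);     // look at up to CABAC_BY22_PEEK_BITS bits
+//     ... decode a prefix/suffix from the top bits of y ...
+//     get_cabac_by22_flush(c, n, y);  // consume the n bits actually used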
+#if !USE_BY22_DIV
+// * 1/x @ 32 bits gets us 22 bits of accuracy
+#define CABAC_BY22_PEEK_BITS 22
+#else
+// A real 32-bit divide gets us another bit
+// If we have a 64 bit int & a unit time divider then we should get a lot
+// of bits (55) but that is untested and it is unclear if it would give
+// us a large advantage
+#define CABAC_BY22_PEEK_BITS 23
+#endif
+
+// Bypass block start
+// Must be called before _by22_peek is used as it sets the CABAC environment
+// into the correct state. _by22_finish must be called to return to 'normal'
+// (i.e. non-bypass) cabac decoding
+static inline void get_cabac_by22_start(CABACContext * const c)
+{
+ const unsigned int bits = __builtin_ctz(c->low);
+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+#if !USE_BY22_DIV
+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+#endif
+
+ c->bytestream -= (CABAC_BITS / 8);
+ c->by22.bits = bits;
+#if !USE_BY22_DIV
+ c->by22.range = c->range;
+ c->range = inv;
+#endif
+ c->low = x;
+}
+
+// Bypass block finish
+// Must be called at the end of the bypass block to return to normal operation
+static inline void get_cabac_by22_finish(CABACContext * const c)
+{
+ unsigned int used = c->by22.bits;
+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+
+ c->bytestream += bytes_used + (CABAC_BITS / 8);
+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+#if !USE_BY22_DIV
+ c->range = c->by22.range;
+#endif
+}
+
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is called and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22 which should be long enough for any prefix or suffix
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+{
+#if USE_BY22_DIV
+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+#else
+ uint32_t x = c->low & ~1U;
+ const uint32_t inv = c->range;
+
+ if (inv != 0)
+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+ return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
+ // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+ // and refill lower bits
+ // We will probably OR over some existing bits but that doesn't matter
+ c->by22.bits += n;
+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif // USE_BY22
+
+
void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
{
if (s->ps.pps->entropy_coding_sync_enabled_flag &&
@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
}
-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
}
int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
}
-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
int log2_size, int *last_scx_prefix, int *last_scy_prefix)
{
int i = 0;
int max = (log2_size << 1) - 1;
int ctx_offset, ctx_shift;
- if (!c_idx) {
+ if (!c_idx_nz) {
ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
ctx_shift = (log2_size + 1) >> 2;
} else {
@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
return value;
}
-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
{
int inc;
- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+ inc = (ctx_cg != 0) + (c_idx_nz << 1);
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
}
-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
- int offset, const uint8_t *ctx_idx_map)
-{
- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-}
-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
{
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
}
@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
}
-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
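+// Bypass (BY22) form of coeff_abs_level_remaining: the peeked word starts with
+// a unary prefix of 1-bits (counted via clz of ~y); a prefix < 3 gives a Rice
+// code with rice_param suffix bits, while longer prefixes switch to the
+// exp-Golomb style form, mirroring the bit-at-a-time decoder further down.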
+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ unsigned int prefix;
+ unsigned int last_coeff_abs_level_remaining;
+ unsigned int n;
+
+ y = get_cabac_by22_peek(c);
+ prefix = hevc_clz32(~y);
+ // y << prefix will always have top bit 0
+
+ if (prefix < 3) {
+ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+ n = prefix + 1 + rice_param;
+ }
+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+ {
+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix * 2 + rice_param - 2;
+ }
+ else {
+ unsigned int suffix;
+
+ get_cabac_by22_flush(c, prefix, y);
+ y = get_cabac_by22_peek(c);
+
+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix + rice_param - 2;
+ }
+
+ get_cabac_by22_flush(c, n, y);
+
+ return last_coeff_abs_level_remaining;
+}
+#endif
+
+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+{
+ CABACContext * const c = &s->HEVClc->cc;
int prefix = 0;
int suffix = 0;
int last_coeff_abs_level_remaining;
int i;
- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
prefix++;
if (prefix == CABAC_MAX_BIN) {
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
return 0;
}
+
if (prefix < 3) {
for (i = 0; i < rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
} else {
int prefix_minus3 = prefix - 3;
for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
<< rc_rice_param) + suffix;
}
+
return last_coeff_abs_level_remaining;
}
-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
{
- int i;
- int ret = 0;
+ CABACContext * const c = &s->HEVClc->cc;
+ unsigned int i;
+ uint32_t ret = 0;
for (i = 0; i < nb; i++)
- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
- return ret;
+ ret = (ret << 1) | get_cabac_bypass(c);
+
+ return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ y = get_cabac_by22_peek(c);
+ get_cabac_by22_flush(c, nb, y);
+ return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i;
+ unsigned int rv = 0;
+ for (i = 0; i != n; ++i) {
+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+ const unsigned int b = get_cabac(c, state0 + idx);
+ rv = (rv << 1) | b;
+ }
+ return rv;
}
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
+// this version of events.
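+// The returned word packs one flag per coefficient, MSB first in the
+// order of significant_coeff_flag_idx[]; a set bit means that
+// coeff_abs_level_remaining is coded for that coefficient.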
+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+ int * const pprev_subset_coded, int * const psum,
+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+ unsigned int rv;
+ unsigned int i;
+ const unsigned int n = FFMIN(n_end, 8);
+
+ // Really this is i != n but the simple unconditional loop is cheaper
+ // and faster
+ for (i = 0; i != 8; ++i)
+ levels[i] = 1;
+
+ rv = get_cabac_greater1_bits(c, n, state0);
+
+ *pprev_subset_coded = 0;
+ *psum = n;
+
+ rv <<= (32 - n);
+ if (rv != 0)
+ {
+ *pprev_subset_coded = 1;
+ *psum = n + 1;
+ i = hevc_clz32(rv);
+ levels[i] = 2;
+ if (get_cabac(c, state_gt2) == 0)
+ {
+ // Unset first coded bit
+ rv &= ~(0x80000000U >> i);
+ }
+ }
+
+ if (n_end > 8) {
+ const unsigned int g8 = n_end - 8;
+ rv |= ((1 << g8) - 1) << (24 - g8);
+ for (i = 0; i != g8; ++i) {
+ levels[i + 8] = 0;
+ }
+ }
+
+ return rv;
+}
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+// or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift down to a minimum of 0, so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+// to achieve it
+
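+// Illustrative worked example: bit_depth = 8, log2_trafo_size = 4, qp = 26
+// gives rem6 = 2, div6 = 4, so scale = 51 and shift = (8 + 4 - 6) - 4 = 2.
+// For level = 3 with the default scale_m = 16:
+//   (((3 * 51 * 16) >> 2) + 1) >> 1 = 306
+// matching the usual spec formulation ((3 * 16 * 51 << 4) + (1 << 6)) >> 7 = 306
+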
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+}
+#endif
+
+
+#ifndef update_rice
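+// Persistent Rice adaptation statistics: the working Rice parameter is
+// later derived as *stat_coeff >> 2, so bumping the counter here moves
+// the parameter in quarter steps.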
+static inline void update_rice(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+ if (x >= 6)
+ (*stat_coeff)++;
+ else if (x == 0 && *stat_coeff > 0)
+ (*stat_coeff)--;
+}
+#endif
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t const * ctx_map,
+ uint8_t * p)
+{
+ do {
+ if (get_cabac(c, state0 + ctx_map[n]))
+ *p++ = n;
+ } while (--n != 0);
+ return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t const * ctx_map,
+ uint8_t * const flag_idx)
+{
+ int rv;
+
+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+ return rv;
+}
+
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x1, x2, x3,\
+ x4, x5, x6, x7,\
+ x8, x9, x10, x11,\
+ x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x8, x12,\
+ x1, x5, x9, x13,\
+ x2, x6, x10, x14,\
+ x3, x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x1, x8,\
+ x5, x2, x12, x9,\
+ x6, x3, x13, x10,\
+ x7, x14, x11, x15}
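+// The three macros above re-emit a 4x4 table given in raster order in
+// horizontal, vertical or diagonal scan order, so the context maps below
+// can be indexed directly by scan position n rather than by (x_c, y_c)
+// as the code they replace did.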
+
+
+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+ uint8_t * const significant_coeff_group_flag,
+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+ int * const pPrev_sig)
+{
+ while (--i >= 0) {
+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
+ const unsigned int x_cg = scan_x_cg[i];
+
+ // For the flag decode we only care about Z/NZ but
+ // we use the full Right * 2 + Down when calculating
+ // significant coeff flags so we obtain it here.
+ //
+ // The group flag array is one longer than it needs to
+ // be so we don't need to check for y_cg limits
+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
+
+ if (i == 0 ||
+ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+ {
+ gf_y[0] |= (1 << x_cg);
+ *pPrev_sig = prev_sig;
+ break;
+ }
+ }
+
+ return i;
+}
+
+#ifdef RPI
+static void rpi_add_residual(HEVCContext * const s,
+ const unsigned int log2_trafo_size, const unsigned int c_idx,
+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+{
+ const AVFrame * const frame = s->frame;
+ unsigned int stride = frame->linesize[c_idx];
+ unsigned int x = x0 >> s->ps.sps->hshift[c_idx];
+ unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
+ const int is_sliced = av_rpi_is_sand_frame(frame);
+ uint8_t * dst = !is_sliced ?
+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(frame, x, y) :
+ av_rpi_sand_frame_pos_c(frame, x, y);
+
+ if (s->enable_rpi) {
+ const unsigned int i = s->jb0->intra.n;
+ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1;
+
+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
+ pc->ta.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->ta.stride == stride);
+
+ pc->type = RPI_PRED_ADD_RESIDUAL_C;
+ }
+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
+ pc->dc.dst == dst)
+ {
+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->dc.stride == stride);
+
+ // Rewrite as add residual - must rewrite all fields as different union member
+ pc->type = RPI_PRED_ADD_RESIDUAL_V;
+ pc->c_idx = c_idx;
+ pc->ta.buf = coeffs;
+ pc->ta.dst = dst;
+ pc->ta.stride = stride;
+ pc->ta.dc = dc;
+ }
+ else
+ {
+ HEVCPredCmd * const cmd = pc + 1;
+ s->jb0->intra.n = i + 1;
+
+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->ta.buf = coeffs;
+ cmd->ta.dst = dst;
+ cmd->ta.stride = stride;
+ cmd->ta.dc = 0;
+ }
+ }
+ else if (!is_sliced || c_idx == 0) {
+ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+ }
+#if RPI_HEVC_SAND
+ // * These should probably never happen
+ else if (c_idx == 1) {
+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0);
+ }
+ else {
+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0);
+ }
+#endif
+}
+
+
+static void rpi_add_dc(HEVCContext * const s,
+ const unsigned int log2_trafo_size, const unsigned int c_idx,
+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+{
+ const AVFrame * const frame = s->frame;
+ const unsigned int stride = frame->linesize[c_idx];
+ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx];
+ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
+ const int is_sliced = av_rpi_is_sand_frame(frame);
+ uint8_t * const dst = !is_sliced ?
+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(frame, x, y) :
+ av_rpi_sand_frame_pos_c(frame, x, y);
+
+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
+
+ if (s->enable_rpi) {
+ const unsigned int i = s->jb0->intra.n;
+ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1;
+
+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
+ pc->ta.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->ta.stride == stride);
+
+ pc->ta.dc = (int16_t)coeff;
+ }
+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
+ pc->dc.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->dc.stride == stride &&
+ (pc->dc.dc & ~0xffff) == 0);
+
+ pc->dc.dc |= (coeff << 16);
+ }
+ else
+ {
+ HEVCPredCmd * const cmd = pc + 1;
+ s->jb0->intra.n = i + 1;
+
+ cmd->type = RPI_PRED_ADD_DC + c_idx;
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->dc.dst = dst;
+ cmd->dc.stride = stride;
+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
+ }
+ }
+}
+
+
+#endif
void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int log2_trafo_size, enum ScanType scan_idx,
int c_idx)
{
-#define GET_COORD(offset, n) \
- do { \
- x_c = (x_cg << 2) + scan_x_off[n]; \
- y_c = (y_cg << 2) + scan_y_off[n]; \
- } while (0)
- HEVCLocalContext *lc = s->HEVClc;
- int transform_skip_flag = 0;
+ HEVCLocalContext * const lc = s->HEVClc;
+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
int last_significant_coeff_x, last_significant_coeff_y;
- int last_scan_pos;
- int n_end;
int num_coeff = 0;
- int greater1_ctx = 1;
+ int prev_subset_coded = 0;
int num_last_subset;
int x_cg_last_sig, y_cg_last_sig;
- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+ const uint8_t *scan_x_cg, *scan_y_cg;
+ const xy_off_t * scan_xy_off;
+#ifndef RPI
ptrdiff_t stride = s->frame->linesize[c_idx];
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
((x0 >> hshift) << s->ps.sps->pixel_shift)];
- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
- uint8_t significant_coeff_group_flag[8][8] = {{0}};
+#endif
+#ifdef RPI
+ int use_vpu;
+ int use_dc = 0;
+#endif
+ int16_t *coeffs;
+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
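+ // (Each entry is a bitmask: bit x_cg of byte y_cg is set when that
+ // coefficient group is significant - replaces the old [8][8] byte array)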
int explicit_rdpcm_flag = 0;
int explicit_rdpcm_dir_flag;
int trafo_size = 1 << log2_trafo_size;
int i;
- int qp,shift,add,scale,scale_m;
+ int qp,shift,scale;
static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
const uint8_t *scale_matrix = NULL;
uint8_t dc_scale;
int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
lc->tu.intra_pred_mode_c;
- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+ int prev_sig = 0;
+ const int c_idx_nz = (c_idx != 0);
+
+ int may_hide_sign;
// Derive QP for dequant
if (!lc->cu.cu_transquant_bypass_flag) {
- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
static const uint8_t rem6[51 + 4 * 6 + 1] = {
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
};
int qp_y = lc->qp_y;
+ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
if (s->ps.pps->transform_skip_enabled_flag &&
log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+ if (transform_skip_flag) {
+ trans_skip_or_bypass = 1;
+ if (lc->cu.pred_mode == MODE_INTRA &&
+ s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+ may_hide_sign = 0;
+ }
+ }
}
if (c_idx == 0) {
@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
qp += s->ps.sps->qp_bd_offset;
}
- shift = s->ps.sps->bit_depth + log2_trafo_size - 5;
- add = 1 << (shift-1);
- scale = level_scale[rem6[qp]] << (div6[qp]);
- scale_m = 16; // default when no custom scaling lists.
- dc_scale = 16;
+ // Shift is set to one less than the value actually applied, as the
+ // scale-and-saturate step adds 1 and then shifts right by one more
+ shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+ scale = level_scale[rem6[qp]];
+ if (div6[qp] >= shift) {
+ scale <<= (div6[qp] - shift);
+ shift = 0;
+ } else {
+ shift -= div6[qp];
+ }
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
int matrix_id = lc->cu.pred_mode != MODE_INTRA;
matrix_id = 3 * matrix_id + c_idx;
scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+ dc_scale = scale_matrix[0];
if (log2_trafo_size >= 4)
dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
}
+ else
+ {
+ static const uint8_t sixteen_scale[64] = {
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ };
+ scale_matrix = sixteen_scale;
+ dc_scale = 16;
+ }
} else {
+ static const uint8_t unit_scale[64] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ };
+ scale_matrix = unit_scale;
shift = 0;
- add = 0;
- scale = 0;
- dc_scale = 0;
+ scale = 2; // We will shift right to kill this
+ dc_scale = 1;
+
+ may_hide_sign = 0;
}
+
+
+
if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+ trans_skip_or_bypass) {
+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
if (explicit_rdpcm_flag) {
- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+ may_hide_sign = 0;
+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
}
}
- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
&last_significant_coeff_x, &last_significant_coeff_y);
if (last_significant_coeff_x > 3) {
@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int last_x_c = last_significant_coeff_x & 3;
int last_y_c = last_significant_coeff_y & 3;
- scan_x_off = ff_hevc_diag_scan4x4_x;
- scan_y_off = ff_hevc_diag_scan4x4_y;
num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
- if (trafo_size == 4) {
+
+ switch (log2_trafo_size) {
+ case 2:
scan_x_cg = scan_1x1;
scan_y_cg = scan_1x1;
- } else if (trafo_size == 8) {
+ break;
+ case 3:
num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = diag_scan2x2_x;
scan_y_cg = diag_scan2x2_y;
- } else if (trafo_size == 16) {
+ break;
+ case 4:
num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan4x4_x;
scan_y_cg = ff_hevc_diag_scan4x4_y;
- } else { // trafo_size == 32
+ break;
+ case 5:
+ default:
num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan8x8_x;
scan_y_cg = ff_hevc_diag_scan8x8_y;
+ break;
}
break;
}
case SCAN_HORIZ:
scan_x_cg = horiz_scan2x2_x;
scan_y_cg = horiz_scan2x2_y;
- scan_x_off = horiz_scan4x4_x;
- scan_y_off = horiz_scan4x4_y;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
break;
default: //SCAN_VERT
scan_x_cg = horiz_scan2x2_y;
scan_y_cg = horiz_scan2x2_x;
- scan_x_off = horiz_scan4x4_y;
- scan_y_off = horiz_scan4x4_x;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
break;
}
num_coeff++;
num_last_subset = (num_coeff - 1) >> 4;
- for (i = num_last_subset; i >= 0; i--) {
- int n, m;
- int x_cg, y_cg, x_c, y_c, pos;
- int implicit_non_zero_coeff = 0;
- int64_t trans_coeff_level;
- int prev_sig = 0;
- int offset = i << 4;
- int rice_init = 0;
-
- uint8_t significant_coeff_flag_idx[16];
- uint8_t nb_significant_coeff_flag = 0;
+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
- x_cg = scan_x_cg[i];
- y_cg = scan_y_cg[i];
+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
- if ((i < num_last_subset) && (i > 0)) {
- int ctx_cg = 0;
- if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+ {
+ const unsigned int ccount = 1 << (log2_trafo_size * 2);
+#ifdef RPI
+ use_vpu = 0;
+ if (s->enable_rpi) {
+ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processing
+ use_dc = (num_coeff == 1) && !special &&
+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
- significant_coeff_group_flag[x_cg][y_cg] =
- significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
- implicit_non_zero_coeff = 1;
- } else {
- significant_coeff_group_flag[x_cg][y_cg] =
- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
- (x_cg == 0 && y_cg == 0));
+ if (use_dc) {
+ // Just need a little empty space
+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+ // No need to clear
+ }
+ else
+ {
+ use_vpu = !special && log2_trafo_size >= 4;
+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+#if HAVE_NEON
+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+#else
+ memset(coeffs, 0, ccount * sizeof(int16_t));
+#endif
+ }
}
+ else
+#endif
+ {
+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+ memset(coeffs, 0, ccount * sizeof(int16_t));
+ }
+ }
- last_scan_pos = num_coeff - offset - 1;
+ i = num_last_subset;
+ do {
+ int implicit_non_zero_coeff = 0;
+ int n_end;
+
+ uint8_t significant_coeff_flag_idx[16];
+ unsigned int nb_significant_coeff_flag = 0;
if (i == num_last_subset) {
+ // First time through
+ int last_scan_pos = num_coeff - (i << 4) - 1;
n_end = last_scan_pos - 1;
significant_coeff_flag_idx[0] = last_scan_pos;
nb_significant_coeff_flag = 1;
} else {
n_end = 15;
+ implicit_non_zero_coeff = (i != 0);
}
- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-
- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
- static const uint8_t ctx_idx_map[] = {
- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+ if (n_end >= 0) {
+ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+ };
+ // N.B. prev_sig = Right * 2 + Down
+ static const uint8_t ctx_idx_maps[3][4][16] = {
+ {
+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ }
};
const uint8_t *ctx_idx_map_p;
int scf_offset = 0;
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
- if (c_idx == 0) {
- scf_offset = 40;
- } else {
- scf_offset = 14 + 27;
- }
+
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ ctx_idx_map_p = ctx_idx_maps[0][3];
+ scf_offset = 40 + c_idx_nz;
} else {
- if (c_idx != 0)
+ if (c_idx_nz != 0)
scf_offset = 27;
+
if (log2_trafo_size == 2) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
} else {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
- if (c_idx == 0) {
- if ((x_cg > 0 || y_cg > 0))
+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+ if (!c_idx_nz) {
+ if (i != 0)
scf_offset += 3;
+
if (log2_trafo_size == 3) {
scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
} else {
@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
}
- for (n = n_end; n > 0; n--) {
- x_c = scan_x_off[n];
- y_c = scan_y_off[n];
- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
- significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
- nb_significant_coeff_flag++;
+
+ if (n_end > 0) {
+ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+ n_end, ctx_idx_map_p,
+ significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+ nb_significant_coeff_flag += cnt;
+ if (cnt != 0) {
implicit_non_zero_coeff = 0;
}
}
+
if (implicit_non_zero_coeff == 0) {
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- if (c_idx == 0) {
- scf_offset = 42;
- } else {
- scf_offset = 16 + 27;
- }
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ scf_offset = 42 + c_idx_nz;
} else {
if (i == 0) {
- if (c_idx == 0)
- scf_offset = 0;
- else
- scf_offset = 27;
+ scf_offset = c_idx_nz ? 27 : 0;
} else {
scf_offset = 2 + scf_offset;
}
}
- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
nb_significant_coeff_flag++;
}
@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
- n_end = nb_significant_coeff_flag;
-
+ if (nb_significant_coeff_flag != 0) {
+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ ((i != 0 && !c_idx_nz) ? 2 : 0) |
+ prev_subset_coded;
+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+ (gt1_idx_delta << 2);
+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+ gt1_idx_delta;
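+ // gt1_idx_delta is the spec's ctxSet: +2 for luma coefficient groups
+ // other than the top-left one, +1 if the previous subset coded a level
+ // greater than 1, and +4 selects the chroma context block; each set
+ // spans 4 greater1 contexts (hence the << 2) and one greater2 context.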
+
+ const unsigned int x_cg = scan_x_cg[i];
+ const unsigned int y_cg = scan_y_cg[i];
+ int16_t * const blk_coeffs = coeffs +
+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+ // This calculation is 'wrong' for log2_trafo_size == 2
+ // but that doesn't matter as in this case x_cg & y_cg
+ // are always 0 so the result is correct (0) anyway
+ const uint8_t * const blk_scale = scale_matrix +
+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+ // * The following code block doesn't deal with these flags:
+ // (nor did the one it replaces)
+ //
+ // cabac_bypass_alignment_enabled_flag
+ // This should be easy but I can't find a test case
+ // extended_precision_processing_flag
+ // This can extend the required precision past 16bits
+ // so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+ if (nb_significant_coeff_flag == 1) {
+ // There is a small gain to be had from special casing the single
+ // transform coefficient case. The reduction in complexity
+ // makes up for the code duplication.
+
+ int trans_coeff_level = 1;
+ int coeff_sign_flag;
+ int coded_val = 0;
+
+ // initialize first elem of coeff_abs_level_greater1_flag
+ prev_subset_coded = 0;
+
+ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+ trans_coeff_level = 2;
+ prev_subset_coded = 1;
+ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+ }
- if (n_end) {
- int first_nz_pos_in_cg;
- int last_nz_pos_in_cg;
- int c_rice_param = 0;
- int first_greater1_coeff_idx = -1;
- uint8_t coeff_abs_level_greater1_flag[8];
- uint16_t coeff_sign_flag;
- int sum_abs = 0;
- int sign_hidden;
- int sb_type;
+ // Probably not worth the overhead of starting by22 for just one value
+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+ if (coded_val)
+ {
+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+ } else {
+ uint8_t * const stat_coeff =
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ const unsigned int c_rice_param = *stat_coeff >> 2;
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
- // initialize first elem of coeff_bas_level_greater1_flag
- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ }
+ }
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
- sb_type = 2 * (c_idx == 0 ? 1 : 0);
- else
- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
- c_rice_param = lc->stat_coeff[sb_type] / 4;
- }
+ {
+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
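+ // k is 0 or -1: (level ^ k) - k below applies the sign without a branch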
+ const unsigned int scale_m = blk_scale[xy_off->scale];
- if (!(i == num_last_subset) && greater1_ctx == 0)
- ctx_set++;
- greater1_ctx = 1;
- last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
- int inc = (ctx_set << 2) + greater1_ctx;
- coeff_abs_level_greater1_flag[m] =
- coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
- if (coeff_abs_level_greater1_flag[m]) {
- greater1_ctx = 0;
- if (first_greater1_coeff_idx == -1)
- first_greater1_coeff_idx = m;
- } else if (greater1_ctx > 0 && greater1_ctx < 3) {
- greater1_ctx++;
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (trans_coeff_level ^ k) - k, // Apply sign
+ scale,
+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+ shift);
}
}
- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-
- if (lc->cu.cu_transquant_bypass_flag ||
- (lc->cu.pred_mode == MODE_INTRA &&
- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag &&
- (pred_mode_intra == 10 || pred_mode_intra == 26 )) ||
- explicit_rdpcm_flag)
- sign_hidden = 0;
else
- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+#endif
+ {
+ int sign_hidden = may_hide_sign;
+ int levels[16]; // Should be able to get away with int16_t but that fails some tests
+ uint32_t coeff_sign_flags;
+ uint32_t coded_vals = 0;
+ // Sum(abs(level[]))
+ // In fact we only need the bottom bit and in some future
+ // version that may be all we calculate
+ unsigned int sum_abs;
+
+ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+
+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+ sign_hidden = 0;
+
+ // -- Start bypass block
+
+ bypass_start(s);
+
+ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+
+ if (coded_vals != 0)
+ {
+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+ int * level = levels - 1;
+
+ do {
+ {
+ const unsigned int z = hevc_clz32(coded_vals) + 1;
+ level += z;
+ coded_vals <<= z;
+ }
- if (first_greater1_coeff_idx != -1) {
- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
- }
- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
- } else {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
- }
+ {
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
- for (m = 0; m < n_end; m++) {
- n = significant_coeff_flag_idx[m];
- GET_COORD(offset, n);
- if (m < 8) {
- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level += last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
+ sum_abs += last_coeff_abs_level_remaining + 1;
+ *level = trans_coeff_level;
+
+ if (stat_coeff != NULL)
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ stat_coeff = NULL;
+
+ if (trans_coeff_level > (3 << c_rice_param) &&
+ (c_rice_param < 4 || rice_adaptation_enabled))
+ ++c_rice_param;
}
- }
- } else {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level = 1 + last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
- }
+ } while (coded_vals != 0);
}
- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
- sum_abs += trans_coeff_level;
- if (n == first_nz_pos_in_cg && (sum_abs&1))
- trans_coeff_level = -trans_coeff_level;
+
+ // sign_hidden = 0 or 1 so we can combine the tests
+ if ((sign_hidden & sum_abs) != 0) {
+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
}
- if (coeff_sign_flag >> 15)
- trans_coeff_level = -trans_coeff_level;
- coeff_sign_flag <<= 1;
- if(!lc->cu.cu_transquant_bypass_flag) {
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
- if(y_c || x_c || log2_trafo_size < 4) {
- switch(log2_trafo_size) {
- case 3: pos = (y_c << 3) + x_c; break;
- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
- default: pos = (y_c << 2) + x_c; break;
- }
- scale_m = scale_matrix[pos];
- } else {
- scale_m = dc_scale;
- }
+
+ bypass_finish(s);
+
+ // -- Finish bypass block
+
+ // Scale loop
+ {
+ int m = nb_significant_coeff_flag - 1;
+
+ // Deal with DC component (if any) first
+ if (i == 0 && significant_coeff_flag_idx[m] == 0)
+ {
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+ blk_coeffs[0] = trans_scale_sat(
+ (levels[m] ^ k) - k, scale, dc_scale, shift);
+ --m;
}
- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
- if(trans_coeff_level < 0) {
- if((~trans_coeff_level) & 0xFffffffffff8000)
- trans_coeff_level = -32768;
- } else {
- if(trans_coeff_level & 0xffffffffffff8000)
- trans_coeff_level = 32767;
+
+#if !USE_N_END_1
+ // If N_END_1 set then m was at least 1 initially
+ if (m >= 0)
+#endif
+ {
+ do {
+ const xy_off_t * const xy_off = scan_xy_off +
+ significant_coeff_flag_idx[m];
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (levels[m] ^ k) - k,
+ scale,
+ blk_scale[xy_off->scale],
+ shift);
+ } while (--m >= 0);
}
}
- coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+
}
}
- }
+ } while ((i = next_subset(s, i, c_idx_nz,
+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
if (lc->cu.cu_transquant_bypass_flag) {
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else {
- if (transform_skip_flag) {
+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
log2_trafo_size == 2 &&
lc->cu.pred_mode == MODE_INTRA;
@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
} else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
s->hevcdsp.idct_4x4_luma(coeffs);
- } else {
+ }
+#ifdef RPI
+ else if (!use_vpu)
+#else
+ else
+#endif
+ {
int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
if (max_xy == 0)
- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+ {
+#ifdef RPI
+ if (use_dc)
+ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs);
+ else
+#endif
+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
+ }
else {
int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
if (max_xy < 4)
@@ -1510,7 +2279,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
}
+#ifdef RPI
+ if (!use_dc)
+ {
+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs);
+ }
+#else
s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+#endif
}
void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 9fbcd1d8b8..df129e2e46 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -22,6 +22,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_SAO
+//#define DISABLE_DEBLOCK
+//#define DISABLE_STRENGTHS
+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+//#define DISABLE_DEBLOCK_NONREF
+
#include "libavutil/common.h"
#include "libavutil/internal.h"
@@ -31,6 +37,16 @@
#include "bit_depth_template.c"
+#ifdef RPI
+#include "rpi_qpu.h"
+#endif
+#if RPI_HEVC_SAND
+#include "rpi_zc.h"
+#include "libavutil/rpi_sand_fns.h"
+#else
+#define RPI_ZC_SAND_8_IN_10_BUF 0
+#endif
+
#define LUMA 0
#define CB 1
#define CR 2
@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
}
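+// For sand (sliced) frames chroma is stored with U and V interleaved,
+// so a chroma sample step is twice the luma one - hence the extra shift
+// for c_idx != 0.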
+static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx)
+{
+#if RPI_HEVC_SAND
+ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
+#else
+ return s->ps.sps->pixel_shift;
+#endif
+}
+
static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
intptr_t stride_dst, intptr_t stride_src)
{
@@ -161,12 +186,21 @@ int i, j;
}
}
+// "DSP" these?
static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
{
- if (pixel_shift)
- *(uint16_t *)dst = *(uint16_t *)src;
- else
- *dst = *src;
+ switch (pixel_shift)
+ {
+ case 2:
+ *(uint32_t *)dst = *(uint32_t *)src;
+ break;
+ case 1:
+ *(uint16_t *)dst = *(uint16_t *)src;
+ break;
+ default:
+ *dst = *src;
+ break;
+ }
}
static void copy_vert(uint8_t *dst, const uint8_t *src,
@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src,
int stride_dst, int stride_src)
{
int i;
- if (pixel_shift == 0) {
- for (i = 0; i < height; i++) {
- *dst = *src;
- dst += stride_dst;
- src += stride_src;
- }
- } else {
- for (i = 0; i < height; i++) {
- *(uint16_t *)dst = *(uint16_t *)src;
- dst += stride_dst;
- src += stride_src;
- }
+ switch (pixel_shift)
+ {
+ case 2:
+ for (i = 0; i < height; i++) {
+ *(uint32_t *)dst = *(uint32_t *)src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ case 1:
+ for (i = 0; i < height; i++) {
+ *(uint16_t *)dst = *(uint16_t *)src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ default:
+ for (i = 0; i < height; i++) {
+ *dst = *src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
}
}
@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
int stride_src, int x, int y, int width, int height,
int c_idx, int x_ctb, int y_ctb)
{
- int sh = s->ps.sps->pixel_shift;
+ const unsigned int sh = pixel_shift(s, c_idx);
int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
@@ -224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s,
int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size);
int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
+ const unsigned int sh = pixel_shift(s, c_idx);
+ int len = (min_pu_size >> hshift) << sh;
for (y = y_min; y < y_max; y++) {
for (x = x_min; x < x_max; x++) {
if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
int n;
- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
+ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
for (n = 0; n < (min_pu_size >> vshift); n++) {
memcpy(src, dst, len);
src += stride_src;
@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s,
static void sao_filter_CTB(HEVCContext *s, int x, int y)
{
- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+#if SAO_FILTER_N == 5
+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+#elif SAO_FILTER_N == 6
+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+#else
+#error Confused by size of sao fn array
+#endif
HEVCLocalContext *lc = s->HEVClc;
int c_idx;
int edges[4]; // 0 left 1 top 2 right 3 bottom
@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
uint8_t right_tile_edge = 0;
uint8_t up_tile_edge = 0;
uint8_t bottom_tile_edge = 0;
+#if RPI_HEVC_SAND
+ const int sliced = av_rpi_is_sand_frame(s->frame);
+ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1);
+#else
+ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1);
+#endif
edges[0] = x_ctb == 0;
edges[1] = y_ctb == 0;
edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
+#ifdef DISABLE_SAO
+ return;
+#endif
+
if (restore) {
if (!edges[0]) {
left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
}
}
- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+ for (c_idx = 0; c_idx < plane_count; c_idx++) {
int x0 = x >> s->ps.sps->hshift[c_idx];
int y0 = y >> s->ps.sps->vshift[c_idx];
int stride_src = s->frame->linesize[c_idx];
@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0);
int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
- int stride_dst;
+ ptrdiff_t stride_dst;
uint8_t *dst;
+#if RPI_HEVC_SAND
+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+ uint8_t * const src = !sliced ?
+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0, y0);
+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
+ !sliced ? src - (1 << sh) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
+ !sliced ? src + (width << sh) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
+
+
+ if (sliced && c_idx > 1) {
+ break;
+ }
+#else
+ const unsigned int sh = s->ps.sps->pixel_shift;
+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh);
+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : src + (width << sh);
+#endif
+
switch (sao->type_idx[c_idx]) {
case SAO_BAND:
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb);
if (s->ps.pps->transquant_bypass_enable_flag ||
(s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
- dst = lc->edge_emu_buffer;
- stride_dst = 2*MAX_PB_SIZE;
- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
- sao->offset_val[c_idx], sao->band_position[c_idx],
- width, height);
- restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
- x, y, width, height, c_idx);
+ dst = lc->edge_emu_buffer;
+ stride_dst = 2*MAX_PB_SIZE;
+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src);
+#if RPI_HEVC_SAND
+ if (sliced && c_idx != 0)
+ {
+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
+ sao->offset_val[1], sao->band_position[1],
+ sao->offset_val[2], sao->band_position[2],
+ width, height);
+ }
+ else
+#endif
+ {
+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+ sao->offset_val[c_idx], sao->band_position[c_idx],
+ width, height);
+ }
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+ x, y, width, height, c_idx);
} else {
- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
- sao->offset_val[c_idx], sao->band_position[c_idx],
- width, height);
+#if RPI_HEVC_SAND
+ if (sliced && c_idx != 0)
+ {
+// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src);
+
+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
+ sao->offset_val[1], sao->band_position[1],
+ sao->offset_val[2], sao->band_position[2],
+ width, height);
+ }
+ else
+#endif
+ {
+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+ sao->offset_val[c_idx], sao->band_position[c_idx],
+ width, height);
+ }
}
sao->type_idx[c_idx] = SAO_APPLIED;
break;
@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
{
int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
- int left_edge = edges[0];
int top_edge = edges[1];
- int right_edge = edges[2];
int bottom_edge = edges[3];
- int sh = s->ps.sps->pixel_shift;
- int left_pixels, right_pixels;
stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
if (!top_edge) {
- int left = 1 - left_edge;
- int right = 1 - right_edge;
- const uint8_t *src1[2];
uint8_t *dst1;
- int src_idx, pos;
+ int src_idx;
+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
- dst1 = dst - stride_dst - (left << sh);
- src1[0] = src - stride_src - (left << sh);
- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
- pos = 0;
- if (left) {
+ dst1 = dst - stride_dst;
+
+ if (src_l != NULL) {
src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1, src1[src_idx], sh);
- pos += (1 << sh);
+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
}
+
src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
- if (right) {
- pos += width << sh;
+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
+
+ if (src_r != NULL) {
src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
}
}
if (!bottom_edge) {
- int left = 1 - left_edge;
- int right = 1 - right_edge;
- const uint8_t *src1[2];
- uint8_t *dst1;
- int src_idx, pos;
+ uint8_t * const dst1 = dst + height * stride_dst;
+ int src_idx;
+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
+ const unsigned int hoff = height * stride_src;
- dst1 = dst + height * stride_dst - (left << sh);
- src1[0] = src + height * stride_src - (left << sh);
- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
- pos = 0;
- if (left) {
+ if (src_l != NULL) {
src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1, src1[src_idx], sh);
- pos += (1 << sh);
+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
}
+
src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
- if (right) {
- pos += width << sh;
+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
+
+ if (src_r != NULL) {
src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
}
}
- left_pixels = 0;
- if (!left_edge) {
+ if (src_l != NULL) {
if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
copy_vert(dst - (1 << sh),
s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
sh, height, stride_dst, 1 << sh);
} else {
- left_pixels = 1;
+ copy_vert(dst - (1 << sh),
+ src_l,
+ sh, height, stride_dst, stride_src);
}
}
- right_pixels = 0;
- if (!right_edge) {
+ if (src_r != NULL) {
if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
copy_vert(dst + (width << sh),
s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
sh, height, stride_dst, 1 << sh);
} else {
- right_pixels = 1;
+ copy_vert(dst + (width << sh),
+ src_r,
+ sh, height, stride_dst, stride_src);
}
}
- copy_CTB(dst - (left_pixels << sh),
- src - (left_pixels << sh),
- (width + left_pixels + right_pixels) << sh,
+ copy_CTB(dst,
+ src,
+ width << sh,
height, stride_dst, stride_src);
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb);
- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
- sao->eo_class[c_idx], width, height);
- s->hevcdsp.sao_edge_restore[restore](src, dst,
- stride_src, stride_dst,
- sao,
- edges, width,
- height, c_idx,
- vert_edge,
- horiz_edge,
- diag_edge);
+#if RPI_HEVC_SAND
+ if (sliced && c_idx != 0)
+ {
+ // Class always the same for both U & V (which is just as well :-))
+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
+ width, height);
+ s->hevcdsp.sao_edge_restore_c[restore](src, dst,
+ stride_src, stride_dst,
+ sao,
+ edges, width,
+ height, c_idx,
+ vert_edge,
+ horiz_edge,
+ diag_edge);
+ }
+ else
+#endif
+ {
+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+ sao->eo_class[c_idx], width, height);
+ s->hevcdsp.sao_edge_restore[restore](src, dst,
+ stride_src, stride_dst,
+ sao,
+ edges, width,
+ height, c_idx,
+ vert_edge,
+ horiz_edge,
+ diag_edge);
+ }
+ // ??? Does this actually work for chroma ???
restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
x, y, width, height, c_idx);
sao->type_idx[c_idx] = SAO_APPLIED;
@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
}
}
}
+
+#if RPI_ZC_SAND_8_IN_10_BUF
+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
+ {
+ const unsigned int stride1 = s->frame->linesize[0];
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
+ const unsigned int xoff = (x >> 8) * stride2 * stride1;
+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
+ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
+
+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
+ }
+#endif
}
+// Returns 2 or 0.
static int get_pcm(HEVCContext *s, int x, int y)
{
int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
uint8_t *src;
int x, y;
int chroma, beta;
- int32_t c_tc[2], tc[2];
+ int32_t c_tc[4], tc[2];
uint8_t no_p[2] = { 0 };
uint8_t no_q[2] = { 0 };
@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->ps.sps->pcm.loop_filter_disable_flag) ||
s->ps.pps->transquant_bypass_enable_flag;
+#ifdef DISABLE_DEBLOCK_NONREF
+ if (!s->used_for_ref)
+ return; // Don't deblock non-reference frames
+#endif
+#ifdef DISABLE_DEBLOCK
+ return;
+#endif
+ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+ return;
if (x0) {
left_tc_offset = s->deblock[ctb - 1].tc_offset;
left_beta_offset = s->deblock[ctb - 1].beta_offset;
@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
tc[0] = bs0 ? TC_CALC(qp, bs0) : 0;
tc[1] = bs1 ? TC_CALC(qp, bs1) : 0;
- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x - 1, y);
no_p[1] = get_pcm(s, x - 1, y + 4);
no_q[0] = get_pcm(s, x, y);
no_q[1] = get_pcm(s, x, y + 4);
- s->hevcdsp.hevc_v_loop_filter_luma_c(src,
- s->frame->linesize[LUMA],
- beta, tc, no_p, no_q);
- } else
- s->hevcdsp.hevc_v_loop_filter_luma(src,
- s->frame->linesize[LUMA],
- beta, tc, no_p, no_q);
+ }
+#if RPI_HEVC_SAND
+ if (av_rpi_is_sand_frame(s->frame)) {
+
+ // This copes properly with no_p/no_q
+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ s->frame->linesize[LUMA],
+ beta, tc, no_p, no_q,
+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
+ }
+ else
+#endif
+ {
+ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+ if (pcmf) {
+ // Standard DSP code is broken if no_p / no_q is set
+ s->hevcdsp.hevc_v_loop_filter_luma_c(src,
+ s->frame->linesize[LUMA],
+ beta, tc, no_p, no_q);
+ }
+ else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((y>>3) & 1) << 1;
+ int b = (x>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[0][b][0][a] = beta;
+ setup[0][b][0][a + 1] = beta;
+ setup[0][b][1][a] = tc[0];
+ setup[0][b][1][a + 1] = tc[1];
+ } else
+#endif
+ {
+ s->hevcdsp.hevc_v_loop_filter_luma(src,
+ s->frame->linesize[LUMA],
+ beta, tc, no_p, no_q);
+ }
+ }
}
}
@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
tc[0] = bs0 ? TC_CALC(qp, bs0) : 0;
tc[1] = bs1 ? TC_CALC(qp, bs1) : 0;
- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+ src =
+#if RPI_HEVC_SAND
+ av_rpi_is_sand_frame(s->frame) ?
+ av_rpi_sand_frame_pos_y(s->frame, x, y) :
+#endif
+ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x, y - 1);
no_p[1] = get_pcm(s, x + 4, y - 1);
@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((x>>3) & 1) << 1;
+ int b = (y>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[1][b][0][a] = beta;
+ setup[1][b][0][a + 1] = beta;
+ setup[1][b][1][a] = tc[0];
+ setup[1][b][1][a + 1] = tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
}
if (s->ps.sps->chroma_format_idc) {
+#if RPI_HEVC_SAND
+ if (av_rpi_is_sand_frame(s->frame)) {
+ const int v = 2;
+ const int h = 2;
+
+ // vertical filtering chroma
+ for (y = y0; y < y_end; y += 8 * v) {
+// const int demi_y = y + 4 * v >= s->ps.sps->height;
+ const int demi_y = 0;
+ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) {
+ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2];
+ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2];
+
+ if ((bs0 == 2) || (bs1 == 2)) {
+ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
+ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1;
+ unsigned int no_f = !demi_y ? 0 : 2 | 8;
+
+ // tc_offset here should probably be set to cur_tc_offset
+ const uint32_t tc4 =
+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) |
+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+
+ if (tc4 == 0)
+ continue;
+
+ if (pcmf) {
+ no_f =
+ (get_pcm(s, x - 1, y) ? 1 : 0) |
+ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) |
+ (get_pcm(s, x, y) ? 4 : 0) |
+ (get_pcm(s, x, y + 4 * v) ? 8 : 0);
+ if (no_f == 0xf)
+ continue;
+ }
+
+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ s->frame->linesize[1],
+ tc4,
+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+ no_f);
+ }
+ }
+
+ if (y == 0)
+ continue;
+
+ // horizontal filtering chroma
+ tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+ x_end2 = x_end;
+ if (x_end != s->ps.sps->width)
+ x_end2 = x_end - 8 * h;
+
+ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) {
+// const int demi_x = x + 4 * v >= s->ps.sps->width;
+ const int demi_x = 0;
+
+ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2];
+ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+ if ((bs0 == 2) || (bs1 == 2)) {
+ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0;
+ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0;
+ const uint32_t tc4 =
+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) |
+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+ unsigned int no_f = !demi_x ? 0 : 2 | 8;
+
+ if (tc4 == 0)
+ continue;
+
+ if (pcmf) {
+ no_f =
+ (get_pcm(s, x, y - 1) ? 1 : 0) |
+ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) |
+ (get_pcm(s, x, y) ? 4 : 0) |
+ (get_pcm(s, x + 4 * h, y) ? 8 : 0);
+
+ if (no_f == 0xf)
+ continue;
+ }
+
+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ s->frame->linesize[1],
+ tc4, no_f);
+ }
+ }
+ }
+ }
+ else
+#endif
for (chroma = 1; chroma <= 2; chroma++) {
int h = 1 << s->ps.sps->hshift[chroma];
int v = 1 << s->ps.sps->vshift[chroma];
@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+ src =
+#if RPI_HEVC_SAND
+ av_rpi_is_sand_frame(s->frame) ?
+ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+#endif
+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x - 1, y);
no_p[1] = get_pcm(s, x - 1, y + (4 * v));
@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((yc>>3) & 1) << 1;
+ int b = (xc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[0][b][0][a] = c_tc[0];
+ setup[0][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_v_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
+
}
}
@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+ src =
+#if RPI_HEVC_SAND
+ av_rpi_is_sand_frame(s->frame) ?
+ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+#endif
+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x, y - 1);
no_p[1] = get_pcm(s, x + (4 * h), y - 1);
@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((xc>>3) & 1) << 1;
+ int b = (yc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[1][b][0][a] = c_tc[0];
+ setup[1][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
}
}
-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
- RefPicList *neigh_refPicList)
-{
- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
- // same L0 and L1
- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] &&
- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else {
- return 1;
- }
- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
- Mv A, B;
- int ref_A, ref_B;
-
- if (curr->pred_flag & 1) {
- A = curr->mv[0];
- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
- } else {
- A = curr->mv[1];
- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
- }
-
- if (neigh->pred_flag & 1) {
- B = neigh->mv[0];
- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
- } else {
- B = neigh->mv[1];
- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
- }
-
- if (ref_A == ref_B) {
- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
- return 1;
- else
- return 0;
- } else
- return 1;
- }
-
- return 1;
-}
void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_trafo_size)
@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
int min_pu_width = s->ps.sps->min_pu_width;
int min_tu_width = s->ps.sps->min_tb_width;
- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
int boundary_upper, boundary_left;
- int i, j, bs;
+ int i, j;
+ RefPicList *rpl = s->ref->refPicList;
+ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size);
+ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup
+ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep
+ int y_pu = y0 >> log2_min_pu_size;
+ int x_pu = x0 >> log2_min_pu_size;
+ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ int is_intra = curr->pred_flag == PF_INTRA;
+ int inc = log2_min_pu_size == 2 ? 2 : 1;
+ uint8_t *bs;
+
+#ifdef DISABLE_STRENGTHS
+ return;
+#endif
boundary_upper = y0 > 0 && !(y0 & 7);
if (boundary_upper &&
@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_upper = 0;
+ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_upper) {
RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
- s->ref->refPicList;
- int yp_pu = (y0 - 1) >> log2_min_pu_size;
- int yq_pu = y0 >> log2_min_pu_size;
- int yp_tu = (y0 - 1) >> log2_min_tu_size;
- int yq_tu = y0 >> log2_min_tu_size;
+ rpl;
+ MvField *top = curr - min_pu_width;
+
+ if (is_intra) {
+ for (i = 0; i < (1 << log2_trafo_size); i += 4)
+ bs[i >> 2] = 2;
+
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+ curr, top, bs);
for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- int x_tu = (x0 + i) >> log2_min_tu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || top_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, top, rpl_top);
- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+ int i_pu = i >> log2_min_pu_size;
+ int i_tu = i >> log2_min_tu_size;
+
+ if (top[i_pu].pred_flag == PF_INTRA)
+ bs[i >> 2] = 2;
+ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+ bs[i >> 2] = 1;
}
+ }
+ }
+
+ if (!is_intra) {
+ for (j = inc; j < trafo_in_min_pus; j += inc) {
+ MvField *top;
+
+ curr += min_pu_width * inc;
+ top = curr - min_pu_width;
+ bs += s->bs_width * inc << log2_min_pu_size >> 2;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, top, bs);
+ }
}
- // bs for vertical TU boundaries
boundary_left = x0 > 0 && !(x0 & 7);
if (boundary_left &&
((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_left = 0;
+ curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_left) {
RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
- s->ref->refPicList;
- int xp_pu = (x0 - 1) >> log2_min_pu_size;
- int xq_pu = x0 >> log2_min_pu_size;
- int xp_tu = (x0 - 1) >> log2_min_tu_size;
- int xq_tu = x0 >> log2_min_tu_size;
-
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int y_pu = (y0 + i) >> log2_min_pu_size;
- int y_tu = (y0 + i) >> log2_min_tu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-
- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || left_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, left, rpl_left);
- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
- }
- }
-
- if (log2_trafo_size > log2_min_pu_size && !is_intra) {
- RefPicList *rpl = s->ref->refPicList;
+ rpl;
+ MvField *left = curr - 1;
- // bs for TU internal horizontal PU boundaries
- for (j = 8; j < (1 << log2_trafo_size); j += 8) {
- int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
- int yq_pu = (y0 + j) >> log2_min_pu_size;
+ if (is_intra) {
+ for (j = 0; j < (1 << log2_trafo_size); j += 4)
+ bs[j * s->bs_width >> 2] = 2;
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-
- bs = boundary_strength(s, curr, top, rpl);
- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+ curr, left, bs);
+
+ for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+ int j_pu = j >> log2_min_pu_size;
+ int j_tu = j >> log2_min_tu_size;
+
+ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+ bs[j * s->bs_width >> 2] = 2;
+ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+ bs[j * s->bs_width >> 2] = 1;
}
}
+ }
- // bs for TU internal vertical PU boundaries
- for (j = 0; j < (1 << log2_trafo_size); j += 4) {
- int y_pu = (y0 + j) >> log2_min_pu_size;
+ if (!is_intra) {
+ for (i = inc; i < trafo_in_min_pus; i += inc) {
+ MvField *left;
- for (i = 8; i < (1 << log2_trafo_size); i += 8) {
- int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
- int xq_pu = (x0 + i) >> log2_min_pu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+ curr += inc;
+ left = curr - 1;
+ bs += inc << log2_min_pu_size >> 2;
- bs = boundary_strength(s, curr, left, rpl);
- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
- }
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, left, bs);
}
}
}
@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
#undef CB
#undef CR
+#ifdef RPI_DEBLOCK_VPU
+// ff_hevc_flush_buffer_lines
+// flushes and invalidates all pixel rows in [start,end-1]
+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+{
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma);
+ rpi_cache_flush_finish(rfe);
+}
+#endif
+
+#if RPI_INTER
+
+// Flush some lines of a reference frame
+void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
+{
+ if (s->enable_rpi && s->used_for_ref) {
+ const int d0 = ((int *)f->progress->data)[0];
+ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1
+
+ if (curr_y < (unsigned int)s->ps.sps->height) {
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y,
+ s->ps.sps->vshift[1], 1, 1);
+ rpi_cache_flush_finish(rfe);
+ }
+ }
+}
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+{
+ // Flush the image from 4 lines above y down to the bottom of the ctb stripe
+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+ // TODO flush buffer of beta/tc setup when it becomes cached
+
+ // Prepare three commands at once to avoid calling overhead
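+ // Each command is six words: destination VC address, stride, setup-array
+ // width, VC address of the beta/tc setup data for this stripe, the number
+ // of 16-high units to process, and a final selector (2, 3 and 4 here for
+ // the Y, U and V planes respectively)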
+ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+ s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+ s->dvq->vpu_cmds_arm[0][5] = 2;
+
+ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[1][5] = 3;
+
+ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+ s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[2][5] = 4;
+
+ // Call VPU
+ {
+ const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands
+ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
+ vpu_qpu_job_finish(vqj);
+ }
+
+ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+ s->dvq = s->dvq_ents + s->dvq_n;
+
+ vpu_qpu_wait(&s->dvq->cmd_id);
+}
+
+#endif
+
void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
{
int x_end = x >= s->ps.sps->width - ctb_size;
+#ifdef RPI_DEBLOCK_VPU
+ int done_deblock = 0;
+#endif
if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
deblocking_filter_CTB(s, x, y);
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock && x_end)
+ {
+ int y_at_end = y >= s->ps.sps->height - ctb_size;
+ int height = 64; // Deblock in units 64 high to avoid too many VPU calls
+ int y_start = y&~63;
+ if (y_at_end) height = s->ps.sps->height - y_start;
+ if ((((y+ctb_size)&63)==0) || y_at_end) {
+ done_deblock = 1;
+ rpi_deblock(s, y_start, height);
+ }
+ }
+#endif
if (s->ps.sps->sao_enabled) {
int y_end = y >= s->ps.sps->height - ctb_size;
if (y && x)
@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
sao_filter_CTB(s, x - ctb_size, y);
if (y && x_end) {
sao_filter_CTB(s, x, y - ctb_size);
- if (s->threads_type & FF_THREAD_FRAME )
- ff_thread_report_progress(&s->ref->tf, y, 0);
+ if (s->threads_type == FF_THREAD_FRAME ) {
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
+#endif
+ ff_hevc_progress_signal_recon(s, y);
+ }
}
if (x_end && y_end) {
sao_filter_CTB(s, x , y);
- if (s->threads_type & FF_THREAD_FRAME )
- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+ if (s->threads_type == FF_THREAD_FRAME ) {
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
+#endif
+ ff_hevc_progress_signal_recon(s, y + ctb_size);
+ }
}
- } else if (s->threads_type & FF_THREAD_FRAME && x_end)
- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+ } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
+ //int newh = y + ctb_size - 4;
+ //int currh = s->ref->tf.progress->data[0];
+ //if (((y + ctb_size)&63)==0)
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+ if (done_deblock) {
+ ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+ }
+ } else {
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+ ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+ }
+#else
+#if RPI_INTER
+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+ ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+#endif
+ }
}
void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
index 4a6dde0f67..8ee37ebfbc 100644
--- a/libavcodec/hevc_mvs.c
+++ b/libavcodec/hevc_mvs.c
@@ -111,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField
return 0;
}
-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
+static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb)
{
int tx, scale_factor;
@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
(scale_factor * src->y < 0)) >> 8);
}
-static int check_mvset(Mv *mvLXCol, Mv *mvCol,
- int colPic, int poc,
- RefPicList *refPicList, int X, int refIdxLx,
- RefPicList *refPicList_col, int listCol, int refidxCol)
+static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol,
+ const int colPic, const int poc,
+ const RefPicList * const refPicList, const int X, const int refIdxLx,
+ const RefPicList * const refPicList_col, const int listCol, const int refidxCol)
{
int cur_lt = refPicList[X].isLongTerm[refIdxLx];
int col_lt = refPicList_col[listCol].isLongTerm[refidxCol];
@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol,
refPicList_col, L ## l, temp_col.ref_idx[l])
// derive the motion vectors section 8.5.3.1.8
-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
- int refIdxLx, Mv *mvLXCol, int X,
- int colPic, RefPicList *refPicList_col)
+static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col,
+ const int refIdxLx, Mv * const mvLXCol, const int X,
+ const int colPic, const RefPicList * const refPicList_col)
{
- RefPicList *refPicList = s->ref->refPicList;
+ const RefPicList * const refPicList = s->ref->refPicList;
if (temp_col.pred_flag == PF_INTRA)
return 0;
@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
/*
* 8.5.3.1.7 temporal luma motion vector prediction
*/
-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH, int refIdxLx,
- Mv *mvLXCol, int X)
+static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH, const int refIdxLx,
+ Mv * const mvLXCol, const int X)
{
MvField *tab_mvf;
MvField temp_col;
int x, y, x_pu, y_pu;
- int min_pu_width = s->ps.sps->min_pu_width;
+ const int min_pu_width = s->ps.sps->min_pu_width;
int availableFlagLXCol = 0;
int colPic;
- HEVCFrame *ref = s->ref->collocated_ref;
+ HEVCFrame * const ref = s->ref->collocated_ref;
- if (!ref) {
+ if (ref == NULL || ref->tab_mvf == NULL) {
memset(mvLXCol, 0, sizeof(*mvLXCol));
return 0;
}
@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
x = x0 + nPbW;
y = y0 + nPbH;
- if (tab_mvf &&
- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
y < s->ps.sps->height &&
x < s->ps.sps->width) {
x &= ~15;
y &= ~15;
if (s->threads_type == FF_THREAD_FRAME)
- ff_thread_await_progress(&ref->tf, y, 0);
+ ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
x_pu = x >> s->ps.sps->log2_min_pu_size;
y_pu = y >> s->ps.sps->log2_min_pu_size;
temp_col = TAB_MVF(x_pu, y_pu);
@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
}
// derive center collocated motion vector
- if (tab_mvf && !availableFlagLXCol) {
+ if (!availableFlagLXCol) {
x = x0 + (nPbW >> 1);
y = y0 + (nPbH >> 1);
x &= ~15;
y &= ~15;
if (s->threads_type == FF_THREAD_FRAME)
- ff_thread_await_progress(&ref->tf, y, 0);
+ ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
x_pu = x >> s->ps.sps->log2_min_pu_size;
y_pu = y >> s->ps.sps->log2_min_pu_size;
temp_col = TAB_MVF(x_pu, y_pu);
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index c1b69a0199..455cdaea1c 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
switch (sps->bit_depth) {
case 8:
if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+#if RPI_HEVC_SAND
+ // *** Horrid kludge so that we start out with sand format
+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
+#else
if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+#endif
if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
break;
@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
break;
case 10:
if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+#if RPI_HEVC_SAND
+ // *** Horrid kludge so that we start out with sand format
+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10;
+#else
if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
+#endif
if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
break;
@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
if (sps_extension_flag[0]) {
int extended_precision_processing_flag;
- int high_precision_offsets_enabled_flag;
int cabac_bypass_alignment_enabled_flag;
sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
"extended_precision_processing_flag not yet implemented\n");
sps->intra_smoothing_disabled_flag = get_bits1(gb);
- high_precision_offsets_enabled_flag = get_bits1(gb);
- if (high_precision_offsets_enabled_flag)
+ sps->high_precision_offsets_enabled_flag = get_bits1(gb);
+ if (sps->high_precision_offsets_enabled_flag)
av_log(avctx, AV_LOG_WARNING,
- "high_precision_offsets_enabled_flag not yet implemented\n");
+ "high_precision_offsets_enabled_flag not fully implemented\n");
sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index df52e401ad..992e994b1a 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -23,7 +23,7 @@
#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
-
+#include "libavutil/rpi_sand_fns.h"
#include "internal.h"
#include "thread.h"
#include "hevc.h"
@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
HEVCFrame *frame = &s->DPB[min_idx];
AVFrame *dst = out;
AVFrame *src = frame->frame;
- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+ const int fmt = src->format;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
int pixel_shift = !!(desc->comp[0].depth > 8);
ret = av_frame_ref(out, src);
@@ -215,13 +216,31 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
if (ret < 0)
return ret;
-
- for (i = 0; i < 3; i++) {
- int hshift = (i > 0) ? desc->log2_chroma_w : 0;
- int vshift = (i > 0) ? desc->log2_chroma_h : 0;
- int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
- (frame->window.top_offset >> vshift) * dst->linesize[i];
- dst->data[i] += off;
+#ifdef RPI
+ if (av_rpi_is_sand_format(fmt))
+ {
+ // Sand cannot be windowed by offset so add side data if we have an offset
+ const HEVCWindow * const window = &frame->window;
+ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0)
+ {
+ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan));
+ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+ si->left_offset = window->left_offset;
+ si->top_offset = window->top_offset;
+ si->pic_width = s->ps.sps->width;
+ si->pic_height = s->ps.sps->height;
+ }
+ }
+ else
+#endif
+ {
+ for (i = 0; i < 3; i++) {
+ int hshift = (i > 0) ? desc->log2_chroma_w : 0;
+ int vshift = (i > 0) ? desc->log2_chroma_h : 0;
+ int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
+ (frame->window.top_offset >> vshift) * dst->linesize[i];
+ dst->data[i] += off;
+ }
}
av_log(s->avctx, AV_LOG_DEBUG,
"Output frame with POC %d.\n", frame->poc);
@@ -426,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
frame->sequence = s->seq_decode;
frame->flags = 0;
- if (s->threads_type == FF_THREAD_FRAME)
- ff_thread_report_progress(&frame->tf, INT_MAX, 0);
+ ff_hevc_progress_set_all_done(frame);
return frame;
}
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d960e..c9661c3ab1 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
#include "hevcdsp_template.c"
#undef BIT_DEPTH
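+// Computes motion-based deblocking boundary strengths (0 or 1) for `pus`
+// consecutive minimal PUs. curr/neigh advance by in_inc bytes per PU and each
+// computed strength is written `dup` times into bs, out_inc elements apart.
+// The intra (bs = 2) and cbf (bs = 1) cases are patched up by the caller in
+// hevc_filter.c.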
+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs)
+{
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+
+#if 1 // This more directly matches the original implementation
+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+ // same L0 and L1
+ if (curr_refL0 == neigh_refL0 &&
+ curr_refL0 == curr_refL1 &&
+ neigh_refL0 == neigh_refL1) {
+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL0 == curr_refL0 &&
+ neigh_refL1 == curr_refL1) {
+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL1 == curr_refL0 &&
+ neigh_refL0 == curr_refL1) {
+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else {
+ strength = 1;
+ }
+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+ Mv curr_mv0, neigh_mv0;
+
+ if (curr->pred_flag & 1) {
+ curr_mv0 = curr->mv[0];
+ } else {
+ curr_mv0 = curr->mv[1];
+ curr_refL0 = curr_refL1;
+ }
+
+ if (neigh->pred_flag & 1) {
+ neigh_mv0 = neigh->mv[0];
+ } else {
+ neigh_mv0 = neigh->mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ if (curr_refL0 == neigh_refL0) {
+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else
+ strength = 1;
+ } else
+ strength = 1;
+#else // This has exactly the same effect, but is more suitable for vectorisation
+ Mv curr_mv[2];
+ Mv neigh_mv[2];
+ memcpy(curr_mv, curr->mv, sizeof curr_mv);
+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+
+ if (!(curr->pred_flag & 2)) {
+ curr_mv[1] = curr_mv[0];
+ curr_refL1 = curr_refL0;
+ }
+ if (!(neigh->pred_flag & 2)) {
+ neigh_mv[1] = neigh_mv[0];
+ neigh_refL1 = neigh_refL0;
+ }
+ if (!(curr->pred_flag & 1)) {
+ curr_mv[0] = curr_mv[1];
+ curr_refL0 = curr_refL1;
+ }
+ if (!(neigh->pred_flag & 1)) {
+ neigh_mv[0] = neigh_mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ strength = 1;
+
+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+
+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+
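+ // Force strength to 1 when exactly one of the two PUs is bi-predicted:
+ // pred_flag + 1 lies in 2..4, so the XOR only has bit 2 set when one side
+ // (and only one) is PF_BI.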
+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+#endif
+
+ curr += in_inc / sizeof (MvField);
+ neigh += in_inc / sizeof (MvField);
+
+ for (out = dup; out > 0; out--)
+ {
+ *bs = strength;
+ bs += out_inc;
+ }
+ }
+}
+
void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
{
#undef FUNC
@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
+#if !RPI_HEVC_SAND
+#define SLICED_LOOP_FILTERS(depth)
+#define SLICED_ADD_RESIDUAL(depth)
+#define SLICED_SAO(depth)
+#else
+#define SLICED_ADD_RESIDUAL(depth)\
+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
+#define SLICED_LOOP_FILTERS(depth)\
+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
+#define SLICED_SAO(depth)\
+ for (i = 0; i != SAO_FILTER_N; ++i) { \
+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
+ } \
+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
+
+#endif
+
#define HEVC_DSP(depth) \
hevcdsp->put_pcm = FUNC(put_pcm, depth); \
- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \
- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \
- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \
- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \
- hevcdsp->transform_skip = FUNC(transform_skip, depth); \
+ hevcdsp->transform_add[0] = FUNC(add_residual4x4, depth); \
+ hevcdsp->transform_add[1] = FUNC(add_residual8x8, depth); \
+ hevcdsp->transform_add[2] = FUNC(add_residual16x16, depth); \
+ hevcdsp->transform_add[3] = FUNC(add_residual32x32, depth); \
+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
+ SLICED_ADD_RESIDUAL(depth); \
hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \
+ hevcdsp->transform_skip = FUNC(transform_skip, depth); \
+ hevcdsp->idct_4x4_luma = FUNC(idct_4x4_luma, depth); \
hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
\
- hevcdsp->sao_band_filter[0] = \
- hevcdsp->sao_band_filter[1] = \
- hevcdsp->sao_band_filter[2] = \
- hevcdsp->sao_band_filter[3] = \
- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \
- hevcdsp->sao_edge_filter[0] = \
- hevcdsp->sao_edge_filter[1] = \
- hevcdsp->sao_edge_filter[2] = \
- hevcdsp->sao_edge_filter[3] = \
- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \
+ for (i = 0; i != SAO_FILTER_N; ++i) { \
+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
+ } \
hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
+ SLICED_SAO(depth); \
\
QPEL_FUNCS(depth); \
QPEL_UNI_FUNCS(depth); \
@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
EPEL_UNI_FUNCS(depth); \
EPEL_BI_FUNCS(depth); \
\
+ SLICED_LOOP_FILTERS(depth); \
hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
@@ -257,6 +409,8 @@ int i = 0;
break;
}
+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+
if (ARCH_X86)
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
if (ARCH_ARM)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd59f..c4a1b0f09d 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -25,6 +25,7 @@
#ifndef AVCODEC_HEVCDSP_H
#define AVCODEC_HEVCDSP_H
+#include "rpi_opts.h"
#include "get_bits.h"
#define MAX_PB_SIZE 64
@@ -42,11 +43,40 @@ typedef struct SAOParams {
uint8_t type_idx[3]; ///< sao_type_idx
} SAOParams;
+typedef struct Mv {
+ int16_t x; ///< horizontal component of motion vector
+ int16_t y; ///< vertical component of motion vector
+} Mv;
+
+typedef struct MvField {
+ DECLARE_ALIGNED(4, Mv, mv)[2];
+ int8_t ref_idx[2];
+ int8_t pred_flag;
+} MvField;
+
+#ifdef RPI
+#define SAO_FILTER_N 6
+#else
+#define SAO_FILTER_N 5
+#endif
+
+
typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
struct GetBitContext *gb, int pcm_bit_depth);
- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+ // add_residual was transform_add - renamed to match the FFmpeg 3.3 names
+ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
+#if RPI_HEVC_SAND
+ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
+ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
+
+ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+ struct GetBitContext *gb, int pcm_bit_depth);
+#endif
void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext {
void (*idct_dc[4])(int16_t *coeffs);
- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
int16_t *sao_offset_val, int sao_left_class, int width, int height);
+#if RPI_HEVC_SAND
+ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+#endif
/* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+#if RPI_HEVC_SAND
+ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
+#endif
void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+#if RPI_HEVC_SAND
+ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+#endif
void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width);
@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext {
void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
+#ifdef RPI
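+ // Sand-specific loop filters. The *2 variants take a separate pointer to
+ // the pixels left of the edge so filtering works across sand stripe
+ // boundaries. tc4 packs four 8-bit chroma tc values (Cb and Cr for each of
+ // the two 4-sample edge segments) and no_f is a 4-bit mask of sub-block
+ // edges whose filtering must be skipped (e.g. PCM) - see the callers in
+ // hevc_filter.c.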
+ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+ const uint8_t no_p[2], const uint8_t no_q[2],
+ uint8_t * _pix_l);
+ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+#endif
+
+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
} HEVCDSPContext;
void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 5bca02342d..122fbe8154 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
#include "bit_depth_template.c"
#include "hevcdsp.h"
+#include "rpi_shader_template.h"
static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
GetBitContext *gb, int pcm_bit_depth)
@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
}
}
-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride, int size)
+#if RPI_HEVC_SAND
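+// PCM for plaited (interleaved) chroma: the bitstream carries all the U
+// samples followed by all the V samples, so fill the even sample positions
+// first and then the odd ones.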
+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+ GetBitContext *gb, int pcm_bit_depth)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+
+ dst = (pixel *)_dst + 1;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+}
+#endif
+
+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride, int size)
{
int x, y;
pixel *dst = (pixel *)_dst;
@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
}
}
-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ dst[x] = av_clip_pixel(dst[x] + dc);
+ }
+ dst += stride;
+ }
+}
+
+
+#if RPI_HEVC_SAND
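+// Plaited chroma helpers: U and V samples are interleaved in a sand frame.
+// The _u/_v variants add a residual to one component while adding a constant
+// dc offset to the other; the _c variants add residuals to both components.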
+static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, const int dc_v, int size)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + *res);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+ res++;
+ }
+ dst += stride;
+ }
}
-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
+static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, const int dc_u, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + dc_u);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
+ res++;
+ }
+ dst += stride;
+ }
+}
+
+static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, unsigned int size)
+{
+ unsigned int x, y;
+ pixel *dst = (pixel *)_dst;
+ const int16_t * ru = res;
+ const int16_t * rv = res + size * size;
+
+// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
+// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
+// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
+ }
+ dst += stride;
+ }
+
+// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
+}
+
+
+static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
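+ // dc packs the U offset in the low 16 bits and the V offset in the high
+ // 16 bits, both sign-extended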
+ const int dc_v = dc >> 16;
+ const int dc_u = (dc << 16) >> 16;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + dc_u);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+ }
+ dst += stride;
+ }
+}
+
+
+#endif
+
+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual)(_dst, coeffs, stride, 4);
+}
+
+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+ FUNC(add_residual)(_dst, coeffs, stride, 8);
}
-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+ FUNC(add_residual)(_dst, coeffs, stride, 16);
}
-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+ FUNC(add_residual)(_dst, coeffs, stride, 32);
}
+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 4);
+}
+
+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 8);
+}
+
+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 16);
+}
+
+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 32);
+}
+
+#if RPI_HEVC_SAND
+// -- U -- (plaited)
+
+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
+}
+
+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
+}
+
+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
+}
+
+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- V -- (plaited)
+
+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
+}
+
+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
+}
+
+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
+}
+
+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- C -- (plaited - both U & V)
+
+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 4);
+}
+
+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 8);
+}
+
+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 16);
+}
+
+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
+}
+
+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
+}
+
+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
+}
+
+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+#endif
+
static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
{
@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
} while (0)
-static void FUNC(transform_4x4_luma)(int16_t *coeffs)
+static void FUNC(idct_4x4_luma)(int16_t *coeffs)
{
int i;
int shift = 7;
@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride
}
}
+
+#if BIT_DEPTH == 10
+#if RPI_HEVC_SAND
+// We need a 32-bit variation for the _c restores, so hijack bit depth 10
+#undef pixel
+#undef BIT_DEPTH
+#define pixel uint32_t
+#define BIT_DEPTH 32
+#endif
+// All 16 bit variations are the same
+#define sao_edge_restore_0_10 sao_edge_restore_0_9
+#define sao_edge_restore_1_10 sao_edge_restore_1_9
+#define sao_edge_restore_0_11 sao_edge_restore_0_9
+#define sao_edge_restore_1_11 sao_edge_restore_1_9
+#define sao_edge_restore_0_12 sao_edge_restore_0_9
+#define sao_edge_restore_1_12 sao_edge_restore_1_9
+#define sao_edge_restore_0_13 sao_edge_restore_0_9
+#define sao_edge_restore_1_13 sao_edge_restore_1_9
+#define sao_edge_restore_0_14 sao_edge_restore_0_9
+#define sao_edge_restore_1_14 sao_edge_restore_1_9
+#define sao_edge_restore_0_15 sao_edge_restore_0_9
+#define sao_edge_restore_1_15 sao_edge_restore_1_9
+#define sao_edge_restore_0_16 sao_edge_restore_0_9
+#define sao_edge_restore_1_16 sao_edge_restore_1_9
+#endif
+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
int *borders, int _width, int _height,
@@ -367,7 +643,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
int x, y;
pixel *dst = (pixel *)_dst;
pixel *src = (pixel *)_src;
- int16_t *sao_offset_val = sao->offset_val[c_idx];
int sao_eo_class = sao->eo_class[c_idx];
int init_x = 0, width = _width, height = _height;
@@ -376,33 +651,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
if (sao_eo_class != SAO_EO_VERT) {
if (borders[0]) {
- int offset_val = sao_offset_val[0];
for (y = 0; y < height; y++) {
- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+ dst[y * stride_dst] = src[y * stride_src];
}
init_x = 1;
}
if (borders[2]) {
- int offset_val = sao_offset_val[0];
int offset = width - 1;
for (x = 0; x < height; x++) {
- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
}
width--;
}
}
if (sao_eo_class != SAO_EO_HORIZ) {
if (borders[1]) {
- int offset_val = sao_offset_val[0];
for (x = init_x; x < width; x++)
- dst[x] = av_clip_pixel(src[x] + offset_val);
+ dst[x] = src[x];
}
if (borders[3]) {
- int offset_val = sao_offset_val[0];
- int y_stride_dst = stride_dst * (height - 1);
- int y_stride_src = stride_src * (height - 1);
+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+ ptrdiff_t y_stride_src = stride_src * (height - 1);
for (x = init_x; x < width; x++)
- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+ dst[x + y_stride_dst] = src[x + y_stride_src];
height--;
}
}
@@ -417,7 +688,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
int x, y;
pixel *dst = (pixel *)_dst;
pixel *src = (pixel *)_src;
- int16_t *sao_offset_val = sao->offset_val[c_idx];
int sao_eo_class = sao->eo_class[c_idx];
int init_x = 0, init_y = 0, width = _width, height = _height;
@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
if (sao_eo_class != SAO_EO_VERT) {
if (borders[0]) {
- int offset_val = sao_offset_val[0];
for (y = 0; y < height; y++) {
- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+ dst[y * stride_dst] = src[y * stride_src];
}
init_x = 1;
}
if (borders[2]) {
- int offset_val = sao_offset_val[0];
int offset = width - 1;
for (x = 0; x < height; x++) {
- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
}
width--;
}
}
if (sao_eo_class != SAO_EO_HORIZ) {
if (borders[1]) {
- int offset_val = sao_offset_val[0];
for (x = init_x; x < width; x++)
- dst[x] = av_clip_pixel(src[x] + offset_val);
+ dst[x] = src[x];
init_y = 1;
}
if (borders[3]) {
- int offset_val = sao_offset_val[0];
- int y_stride_dst = stride_dst * (height - 1);
- int y_stride_src = stride_src * (height - 1);
+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+ ptrdiff_t y_stride_src = stride_src * (height - 1);
for (x = init_x; x < width; x++)
- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+ dst[x + y_stride_dst] = src[x + y_stride_src];
height--;
}
}
@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
}
}
+#endif
+#if BIT_DEPTH == 32
+#undef BIT_DEPTH
+#undef pixel
+#define BIT_DEPTH 10
+#define pixel uint16_t
+#endif
+
+// --- Plaited chroma versions
+
+#if RPI_HEVC_SAND
+
+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int offset_table_u[32] = { 0 };
+ int offset_table_v[32] = { 0 };
+ int k, y, x;
+ int shift = BIT_DEPTH - 5;
+
+ stride_dst /= sizeof(pixel);
+ stride_src /= sizeof(pixel);
+ width *= 2;
+
+ for (k = 0; k < 4; k++)
+ {
+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+ }
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 2)
+ {
+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
+ // *** The & 31 masking shouldn't be needed, but at present we generate broken
+ // input that would crash us in the 10-bit case without it
+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
+ }
+ dst += stride_dst;
+ src += stride_src;
+ }
+}
+
+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+ int eo, int width, int height) {
+
+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+ static const int8_t pos[4][2][2] = {
+ { { -1, 0 }, { 1, 0 } }, // horizontal
+ { { 0, -1 }, { 0, 1 } }, // vertical
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+ };
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+
+ stride_dst /= sizeof(pixel);
+ width *= 2;
+
+ av_assert0(width <= 64);
+
+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 2) {
+ int diff0u = CMP(src[x], src[x + a_stride]);
+ int diff1u = CMP(src[x], src[x + b_stride]);
+ int offset_valu = edge_idx[2 + diff0u + diff1u];
+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+ int offset_valv = edge_idx[2 + diff0v + diff1v];
+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
+ }
+ src += stride_src;
+ dst += stride_dst;
+ }
+}
+
+// Do once
+#if BIT_DEPTH == 8
+// Any old 2 byte 'normal' restore will work for these
+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
+// We need 32 bit for 9 bit+
+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
+#endif
+
+#endif // RPI_HEVC_SAND
+
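A brief editorial note on the layout the *_c helpers above assume: chroma is stored interleaved ("plaited"), with the U and V samples of each pixel adjacent, which is why the loops double width, step x by 2 and keep separate U and V offset tables. The sketch below (hypothetical helper, not part of the patch) shows the addressing convention.

    #include <stdint.h>

    /* Pixel x of an interleaved chroma row has its U sample at 2*x and its V
     * sample at 2*x + 1, so independent U/V adjustments can be applied in a
     * single pass over the plaited data. */
    static void plaited_row_add(uint8_t *row, int chroma_width, int u_off, int v_off)
    {
        for (int x = 0; x < chroma_width; x++) {
            int u = row[2 * x + 0] + u_off;   /* U sample of pixel x */
            int v = row[2 * x + 1] + v_off;   /* V sample of pixel x */
            row[2 * x + 0] = u < 0 ? 0 : u > 255 ? 255 : u;
            row[2 * x + 1] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }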
#undef CMP
@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
#undef TQ1
#undef TQ2
#undef TQ3
+
+#if RPI_HEVC_SAND
+
+// line zero
+#define P3 pix_l[0 * xstride]
+#define P2 pix_l[1 * xstride]
+#define P1 pix_l[2 * xstride]
+#define P0 pix_l[3 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+#define Q2 pix_r[2 * xstride]
+#define Q3 pix_r[3 * xstride]
+
+// line three. used only for deblocking decision
+#define TP3 pix_l[0 * xstride + 3 * ystride]
+#define TP2 pix_l[1 * xstride + 3 * ystride]
+#define TP1 pix_l[2 * xstride + 3 * ystride]
+#define TP0 pix_l[3 * xstride + 3 * ystride]
+#define TQ0 pix_r[0 * xstride + 3 * ystride]
+#define TQ1 pix_r[1 * xstride + 3 * ystride]
+#define TQ2 pix_r[2 * xstride + 3 * ystride]
+#define TQ3 pix_r[3 * xstride + 3 * ystride]
+
+// This is identical to hevc_loop_filter_luma except that the P/Q
+// components are on separate pointers
+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, const int32_t _tc[2],
+ const uint8_t _no_p[2], const uint8_t _no_q[2],
+ uint8_t * _pix_l)
+{
+ int d, j;
+ pixel *pix_l = (pixel *)_pix_l;
+ pixel *pix_r = (pixel *)_pix_r;
+ const ptrdiff_t xstride = 1;
+ const ptrdiff_t ystride = _stride / sizeof(pixel);
+
+ beta <<= BIT_DEPTH - 8;
+
+ for (j = 0; j < 2; j++) {
+ const int dp0 = abs(P2 - 2 * P1 + P0);
+ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
+ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
+ const int d0 = dp0 + dq0;
+ const int d3 = dp3 + dq3;
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ const int no_p = _no_p[j];
+ const int no_q = _no_q[j];
+
+ if (d0 + d3 >= beta) {
+ pix_l += 4 * ystride;
+ pix_r += 4 * ystride;
+ continue;
+ } else {
+ const int beta_3 = beta >> 3;
+ const int beta_2 = beta >> 2;
+ const int tc25 = ((tc * 5 + 1) >> 1);
+
+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
+ // strong filtering
+ const int tc2 = tc << 1;
+ for (d = 0; d < 4; d++) {
+ const int p3 = P3;
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ const int q3 = Q3;
+ if (!no_p) {
+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+ }
+ if (!no_q) {
+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+ }
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ } else { // normal filtering
+ int nd_p = 1;
+ int nd_q = 1;
+ const int tc_2 = tc >> 1;
+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+ nd_p = 2;
+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+ nd_q = 2;
+
+ for (d = 0; d < 4; d++) {
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+ if (abs(delta0) < 10 * tc) {
+ delta0 = av_clip(delta0, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ if (!no_p && nd_p > 1) {
+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+ P1 = av_clip_pixel(p1 + deltap1);
+ }
+ if (!no_q && nd_q > 1) {
+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+ Q1 = av_clip_pixel(q1 + deltaq1);
+ }
+ }
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ }
+ }
+ }
+}
+
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2
+#undef TQ3
+
+#undef P3
+#undef P2
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+
+#define P1 pix_l[0 * xstride]
+#define P0 pix_l[1 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+
+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
+ ptrdiff_t _ystride, const int32_t *_tc,
+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
+{
+ int d, j, no_p, no_q;
+ pixel *pix_l = (pixel *)_pix_l;
+ pixel *pix_r = (pixel *)_pix_r;
+ ptrdiff_t xstride = _xstride / sizeof(pixel);
+ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+ for (j = 0; j < 2; j++) {
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ if (tc <= 0) {
+ pix_l += 4 * ystride;
+ pix_r += 4 * ystride;
+ continue;
+ }
+ no_p = _no_p[j];
+ no_q = _no_q[j];
+
+ for (d = 0; d < 4; d++) {
+ int delta0;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ }
+}
+
+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+ unsigned int no_f)
+{
+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
+}
+
+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f)
+{
+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
+}
+
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+
+
+#endif
+
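An editorial aside on the argument packing used by hevc_h_loop_filter_uv and hevc_v_loop_filter_uv2 above: the four per-sub-edge tc values arrive packed one per byte of tc4, and the p/q "skip" flags arrive in the low four bits of no_f, exactly as the unpacking at the top of those functions shows. The helpers below are a hedged sketch of how a caller might build those words; they are illustrative only and are not part of the patch.

    #include <stdint.h>

    /* Pack four per-sub-edge tc values, one per byte, lowest sub-edge first. */
    static inline uint32_t pack_tc4(uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3)
    {
        return (uint32_t)tc0 | ((uint32_t)tc1 << 8) |
               ((uint32_t)tc2 << 16) | ((uint32_t)tc3 << 24);
    }

    /* Pack the no-filter flags: bits 0-1 select the P side, bits 2-3 the Q side. */
    static inline uint32_t pack_no_f(int no_p0, int no_p1, int no_q0, int no_q1)
    {
        return (no_p0 ? 1u : 0u) | (no_p1 ? 2u : 0u) |
               (no_q0 ? 4u : 0u) | (no_q1 ? 8u : 0u);
    }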
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
index 02c1766059..cea16eade4 100644
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@@ -24,6 +24,7 @@
#include "hevcpred.h"
+#define PRED_C 0
#define BIT_DEPTH 8
#include "hevcpred_template.c"
#undef BIT_DEPTH
@@ -39,13 +40,37 @@
#define BIT_DEPTH 12
#include "hevcpred_template.c"
#undef BIT_DEPTH
+#undef PRED_C
+
+#ifdef RPI
+#define PRED_C 1
+#define BIT_DEPTH 8
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+#undef PRED_C
+#endif
void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
{
#undef FUNC
#define FUNC(a, depth) a ## _ ## depth
-#define HEVC_PRED(depth) \
+#undef FUNCC
+#define FUNCC(a, depth) a ## _ ## depth ## _c
+
+#define HEVC_PRED_Y(depth) \
hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \
hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \
hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \
@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
+#define HEVC_PRED_C(depth) \
+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
+
+#ifdef RPI
+#define HEVC_PRED(depth) \
+ HEVC_PRED_Y(depth); \
+ HEVC_PRED_C(depth);
+#else
+#define HEVC_PRED(depth) \
+ HEVC_PRED_Y(depth);
+#endif
+
switch (bit_depth) {
case 9:
HEVC_PRED(9);
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
index eb17663683..00ba3f94c0 100644
--- a/libavcodec/hevcpred.h
+++ b/libavcodec/hevcpred.h
@@ -38,6 +38,17 @@ typedef struct HEVCPredContext {
void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
const uint8_t *left, ptrdiff_t stride,
int c_idx, int mode);
+#ifdef RPI
+ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+
+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+ ptrdiff_t stride, int log2_size, int c_idx);
+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int c_idx, int mode);
+#endif
} HEVCPredContext;
void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 6fe33546b1..2f9f5f2798 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -20,13 +20,110 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_INTRA
+
#include "libavutil/pixdesc.h"
#include "bit_depth_template.c"
#include "hevcpred.h"
+#ifdef RPI
+#include "libavutil/rpi_sand_fns.h"
+#endif
+
+#define DUMP_PRED 0
+
#define POS(x, y) src[(x) + stride * (y)]
+// INCLUDED_ONCE defined at EOF
+#if defined(RPI) && !defined(INCLUDED_ONCE)
+typedef uint8_t (* c8_dst_ptr_t)[2];
+typedef const uint8_t (* c8_src_ptr_t)[2];
+typedef uint16_t (* c16_dst_ptr_t)[2];
+typedef const uint16_t (* c16_src_ptr_t)[2];
+
+// *** On ARM make these NEON registers
+typedef struct pixel4_16 {
+ uint16_t x[4];
+} pixel4_16;
+typedef struct pixel4_32 {
+ uint32_t x[4];
+} pixel4_32;
+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
+{
+ pixel4_16 t = {{x, x, x, x}};
+ return t;
+}
+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
+{
+ pixel4_32 t = {{x, x, x, x}};
+ return t;
+}
+#endif
+
+#if PRED_C
+// For chroma we double pixel size so we copy pairs
+#undef pixel
+#undef pixel2
+#undef pixel4
+#undef dctcoef
+#undef INIT_CLIP
+#undef no_rnd_avg_pixel4
+#undef rnd_avg_pixel4
+#undef AV_RN2P
+#undef AV_RN4P
+#undef AV_RN4PA
+#undef AV_WN2P
+#undef AV_WN4P
+#undef AV_WN4PA
+#undef CLIP
+#undef FUNC
+#undef FUNCC
+#undef av_clip_pixel
+#undef PIXEL_SPLAT_X4
+
+#if BIT_DEPTH == 8
+#define pixel uint16_t
+#define pixel4 pixel4_16
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
+#define cpel uint8_t
+#define c_src_ptr_t c8_src_ptr_t
+#define c_dst_ptr_t c8_dst_ptr_t
+#else
+#define pixel uint32_t
+#define pixel4 pixel4_32
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
+#define cpel uint16_t
+#define c_src_ptr_t c16_src_ptr_t
+#define c_dst_ptr_t c16_dst_ptr_t
+#endif
+#define AV_RN4P(p) (*(pixel4*)(p))
+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
+#endif
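An editorial note on the type swap above: in the PRED_C build a "pixel" is widened to hold a U/V pair, so every generic load, store or splat in the shared template moves both chroma components at once; the "no samples available" fill further down, (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)), writes the mid-grey value into both halves of the pair in one go. A minimal sketch for the 8-bit case, with a hypothetical helper name and assuming the little-endian byte order of the target:

    #include <stdint.h>

    /* One PRED_C "pixel" at 8 bits is a 16-bit word holding a U/V byte pair;
     * a single store therefore updates both chroma components. */
    static inline uint16_t uv_pair_splat8(uint8_t u, uint8_t v)
    {
        return (uint16_t)u | ((uint16_t)v << 8);  /* U in the low byte on little-endian */
    }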
+
+
+// Get PW prior to horrid PRED_C trickery
+#if BIT_DEPTH == 8
+#define PW 1
+#else
+#define PW 2
+#endif
+
+
+#if DUMP_PRED && !defined(INCLUDED_ONCE)
+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+{
+ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
+ for (unsigned int x = 0; x != size; x++) {
+ printf("%4d", data[x * 2]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+}
+#endif
+
static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
int log2_size, int c_idx)
{
@@ -69,8 +166,11 @@ do { \
AV_WN4P(&ptr[i], a); \
else \
a = PIXEL_SPLAT_X4(ptr[i + 3])
-
+#ifdef RPI
+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+#else
HEVCLocalContext *lc = s->HEVClc;
+#endif
int i;
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
@@ -79,15 +179,23 @@ do { \
int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
int size_in_luma_v = size << vshift;
int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
- int x = x0 >> hshift;
- int y = y0 >> vshift;
+ const int x = x0 >> hshift;
+ const int y = y0 >> vshift;
int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+#if defined(RPI)
+ pixel *const src = !av_rpi_is_sand_frame(s->frame) ?
+ (pixel*)s->frame->data[c_idx] + x + y * stride :
+ c_idx == 0 ?
+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
+#else
pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride;
+#endif
int min_pu_width = s->ps.sps->min_pu_width;
@@ -95,14 +203,20 @@ do { \
lc->tu.intra_pred_mode;
pixel4 a;
pixel left_array[2 * MAX_TB_SIZE + 1];
+#if !PRED_C
pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
+#endif
pixel top_array[2 * MAX_TB_SIZE + 1];
+#if !PRED_C
pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
+#endif
pixel *left = left_array + 1;
pixel *top = top_array + 1;
+#if !PRED_C
pixel *filtered_left = filtered_left_array + 1;
pixel *filtered_top = filtered_top_array + 1;
+#endif
int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
int cand_left = lc->na.cand_left;
int cand_up_left = lc->na.cand_up_left;
@@ -114,6 +228,27 @@ do { \
int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
(x0 + size_in_luma_h)) >> hshift;
+ pixel * src_l = src - 1;
+ pixel * src_u = src - stride;
+ pixel * src_ur = src_u + size;
+
+#ifdef DISABLE_INTRA
+ return;
+#endif
+
+#if defined(RPI)
+ if (av_rpi_is_sand_frame(s->frame)) {
+ // N.B. stride is in pixels (not bytes), or in pixel-pairs in the chroma case
+ const AVFrame * const frame = s->frame;
+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
+ if ((x & mask) == 0)
+ src_l -= stripe_adj;
+ if (((x + size) & mask) == 0)
+ src_ur += stripe_adj;
+ }
+#endif
+
if (s->ps.pps->constrained_intra_pred_flag == 1) {
int size_in_luma_pu_v = PU(size_in_luma_v);
int size_in_luma_pu_h = PU(size_in_luma_h);
@@ -163,23 +298,24 @@ do { \
top[-1] = 128;
}
if (cand_up_left) {
- left[-1] = POS(-1, -1);
+ left[-1] = src_l[-stride];
top[-1] = left[-1];
}
if (cand_up)
- memcpy(top, src - stride, size * sizeof(pixel));
+ // Always good - even with sand
+ memcpy(top, src_u, size * sizeof(pixel));
if (cand_up_right) {
- memcpy(top + size, src - stride + size, size * sizeof(pixel));
- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
size - top_right_size);
}
if (cand_left)
for (i = 0; i < size; i++)
- left[i] = POS(-1, i);
+ left[i] = src_l[stride * i];
if (cand_bottom_left) {
for (i = size; i < size + bottom_left_size; i++)
- left[i] = POS(-1, i);
- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+ left[i] = src_l[stride * i];
+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
size - bottom_left_size);
}
@@ -268,7 +404,11 @@ do { \
cand_up_left = 1;
cand_left = 1;
} else { // No samples available
+#if PRED_C
+ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8));
+#else
left[-1] = (1 << (BIT_DEPTH - 1));
+#endif
EXTEND(top, left[-1], 2 * size);
EXTEND(left, left[-1], 2 * size);
}
@@ -287,6 +427,9 @@ do { \
top[-1] = left[-1];
// Filtering process
+ // Sand can only apply to chroma_format_idc == 1 so we don't need to
+ // worry about chroma smoothing for that case
+#if !PRED_C
if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
if (mode != INTRA_DC && size != 4){
int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
@@ -342,6 +485,30 @@ do { \
mode);
break;
}
+#else
+ switch (mode) {
+ case INTRA_PLANAR:
+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
+ (uint8_t *)left, stride, log2_size, c_idx);
+ break;
+ default:
+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+ (uint8_t *)left, stride, c_idx,
+ mode);
+ break;
+ }
+
+#if DUMP_PRED
+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
+#endif
+#endif
}
#define INTRA_PRED(size) \
@@ -357,6 +524,7 @@ INTRA_PRED(5)
#undef INTRA_PRED
+#if !PRED_C
static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
const uint8_t *_left, ptrdiff_t stride,
int trafo_size)
@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to
POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
(size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1);
}
+#else
+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
+ const uint8_t * _left, ptrdiff_t stride,
+ int trafo_size)
+{
+ int x, y;
+ int size = 1 << trafo_size;
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const c_src_ptr_t top = (c_src_ptr_t)_top;
+ const c_src_ptr_t left = (c_src_ptr_t)_left;
+
+ for (y = 0; y < size; y++, src += stride)
+ {
+ for (x = 0; x < size; x++)
+ {
+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] +
+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] +
+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
+ }
+ }
+}
+#endif
#define PRED_PLANAR(size)\
static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
@@ -386,6 +577,7 @@ PRED_PLANAR(3)
#undef PRED_PLANAR
+#if !PRED_C
static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
const uint8_t *_left,
ptrdiff_t stride, int log2_size, int c_idx)
@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
}
}
+#else
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride, int log2_size, int c_idx)
+{
+ unsigned int i, j;
+ const unsigned int size = (1 << log2_size);
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const c_src_ptr_t top = (c_src_ptr_t)_top;
+ const c_src_ptr_t left = (c_src_ptr_t)_left;
+ unsigned int dc0 = size;
+ unsigned int dc1 = size;
+
+ for (i = 0; i < size; i++)
+ {
+ dc0 += left[i][0] + top[i][0];
+ dc1 += left[i][1] + top[i][1];
+ }
+
+ dc0 >>= log2_size + 1;
+ dc1 >>= log2_size + 1;
+
+ for (i = 0; i < size; i++, src += stride)
+ {
+ for (j = 0; j < size; ++j)
+ {
+ src[j][0] = dc0;
+ src[j][1] = dc1;
+ }
+ }
+}
+#endif
+
+#ifndef ANGLE_CONSTS
+#define ANGLE_CONSTS
+static const int intra_pred_angle[] = {
+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+};
+static const int inv_angle[] = {
+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+ -630, -910, -1638, -4096
+};
+#endif
+
+#if !PRED_C
static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
const uint8_t *_top,
const uint8_t *_left,
@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
const pixel *top = (const pixel *)_top;
const pixel *left = (const pixel *)_left;
- static const int intra_pred_angle[] = {
- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
- };
- static const int inv_angle[] = {
- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
- -630, -910, -1638, -4096
- };
-
int angle = intra_pred_angle[mode - 2];
pixel ref_array[3 * MAX_TB_SIZE + 4];
pixel *ref_tmp = ref_array + size;
@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
}
}
}
+#else
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride, int c_idx,
+ int mode, int size)
+{
+ int x, y;
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ c_src_ptr_t top = (c_src_ptr_t)_top;
+ c_src_ptr_t left = (c_src_ptr_t)_left;
+
+ const int angle = intra_pred_angle[mode - 2];
+ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
+ c_dst_ptr_t ref_tmp = ref_array + size;
+ c_src_ptr_t ref;
+ const int last = (size * angle) >> 5;
+
+ if (mode >= 18) {
+ ref = top - 1;
+ if (angle < 0 && last < -1) {
+ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW);
+ for (x = last; x <= -1; x++)
+ {
+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+ }
+ ref = (c_src_ptr_t)ref_tmp;
+ }
+
+ for (y = 0; y < size; y++, src += stride) {
+ const int idx = ((y + 1) * angle) >> 5;
+ const int fact = ((y + 1) * angle) & 31;
+ if (fact) {
+ for (x = 0; x < size; ++x) {
+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
+ fact * ref[x + idx + 2][0] + 16) >> 5;
+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
+ fact * ref[x + idx + 2][1] + 16) >> 5;
+ }
+ } else {
+ memcpy(src, ref + idx + 1, size * 2 * PW);
+ }
+ }
+ } else {
+ ref = left - 1;
+ if (angle < 0 && last < -1) {
+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
+ for (x = last; x <= -1; x++)
+ {
+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+ }
+ ref = (c_src_ptr_t)ref_tmp;
+ }
+
+ for (x = 0; x < size; x++, src++) {
+ const int idx = ((x + 1) * angle) >> 5;
+ const int fact = ((x + 1) * angle) & 31;
+ if (fact) {
+ for (y = 0; y < size; y++) {
+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
+ fact * ref[y + idx + 2][0] + 16) >> 5;
+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
+ fact * ref[y + idx + 2][1] + 16) >> 5;
+ }
+ } else {
+ for (y = 0; y < size; y++)
+ {
+ src[y * stride][0] = ref[y + idx + 1][0];
+ src[y * stride][1] = ref[y + idx + 1][1];
+ }
+ }
+ }
+ }
+}
+#endif
static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
const uint8_t *left,
@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
}
+#undef cpel
+#undef c_src_ptr_t
+#undef c_dst_ptr_t
+
#undef EXTEND_LEFT_CIP
#undef EXTEND_RIGHT_CIP
#undef EXTEND_UP_CIP
@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
#undef EXTEND
#undef MIN_TB_ADDR_ZS
#undef POS
+#undef PW
+
+#ifndef INCLUDED_ONCE
+#define INCLUDED_ONCE
+#endif
+
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index d36b68bfae..b526dc393d 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
{ AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
{ AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
+ /* RPI */
+#ifdef RPI
+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
+#endif
+
/* special */
{ AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
{ AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index d83705645c..4c746786ff 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -31,6 +31,8 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
+#include "libavutil/avassert.h"
+#include "libavutil/rpi_sand_fns.h"
static av_cold int raw_encode_init(AVCodecContext *avctx)
{
@@ -47,6 +49,73 @@ FF_ENABLE_DEPRECATION_WARNINGS
return 0;
}
+#ifdef RPI
+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
+ int size;
+ int width = frame->width;
+ int height = frame->height;
+ int x0 = 0;
+ int y0 = 0;
+ uint8_t * dst;
+ int ret;
+
+ if (sd != NULL) {
+ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+
+ x0 = si->left_offset;
+ y0 = si->top_offset;
+ }
+
+ size = width * height * 3 / 2;
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
+ dst += width * height;
+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
+ return 0;
+}
+
+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
+ int size;
+ int width = frame->width;
+ int height = frame->height;
+ int x0 = 0;
+ int y0 = 0;
+ uint8_t * dst;
+ int ret;
+
+ if (sd != NULL) {
+ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+
+ x0 = si->left_offset;
+ y0 = si->top_offset;
+ }
+
+ size = width * height * 3;
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
+ dst += width * height * 2;
+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
+ return 0;
+}
+#endif
+
+
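As a quick editorial sanity check on the packet sizes above: for an 8-bit 1920x1080 SAND frame the planar YUV 4:2:0 output is 1920 * 1080 * 3 / 2 = 3,110,400 bytes, while the 16-bit-per-sample variant doubles every sample and so needs 1920 * 1080 * 3 = 6,220,800 bytes.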
static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
const AVFrame *frame, int *got_packet)
{
@@ -56,6 +125,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
if (ret < 0)
return ret;
+#ifdef RPI
+ if (av_rpi_is_sand_frame(frame)) {
+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame);
+ *got_packet = (ret == 0);
+ return ret;
+ }
+#endif
+
if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
return ret;
if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
new file mode 100644
index 0000000000..391f761df9
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.s
@@ -0,0 +1,923 @@
+# ******************************************************************************
+# Argon Design Ltd.
+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+#
+# Module : HEVC
+# Author : Peter de Rivaz
+# ******************************************************************************
+
+# HEVC VPU Transform
+# fe
+# Transform matrix can be thought of as
+# output row vector = input row vector * transMatrix2
+#
+# The even rows of the matrix are symmetric
+# The odd rows of the matrix are antisymmetric
+#
+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+#
+# EXAMPLE
+# (a b c d) ( 1  2  2  1)
+#           ( 3  4 -4 -3)
+#           ( 5  6  6  5)
+#           ( 7  8 -8 -7)
+#
+# x=(a c)(1 2) = 1a+5c 2a+6c
+#        (5 6)
+#
+# y=(b d)(3 4) = 3b+7d 4b+8d
+#        (7 8)
+#
+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+#
+# Final results are (u , v[::-1])
+#
+#
+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+# Apply the even matrix first and stop before rounding
+# Then apply the odd matrix in a full manner:
+#
+# First step is to compute partial products with the first input (16 cycles)
+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
+# 2a 4b 6c 8d
+# 2a -4b 6c -8d
+# 1a -3b 5c -7d
+#
+# Second step is to sum partial products into final position (8 cycles)
+# 1a+3b+5c+7d
+# 2a+4b+6c+8d
+# 2a-4b+6c-8d
+# 1a-3b+5c-7d
+#
+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+#
+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+#
+# For 8x8 we could compute two in parallel.
+#
+#
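The worked example in the comment above maps directly onto a few lines of C. The sketch below is editorial (it is not part of the VPU code) and just shows the 4-point case with the matrix rows 1..8: only the two half-size partial products are computed, and the butterfly then yields all four outputs.

    /* out = in * M for the example matrix, using the even/odd split: the even
     * rows (1 2 2 1) and (5 6 6 5) are symmetric, the odd rows antisymmetric,
     * so the full result is (x + y, reversed(x - y)). */
    static void butterfly4_example(const int in[4], int out[4])
    {
        const int a = in[0], b = in[1], c = in[2], d = in[3];

        int x0 = 1 * a + 5 * c, x1 = 2 * a + 6 * c;   /* even inputs (a, c) */
        int y0 = 3 * b + 7 * d, y1 = 4 * b + 8 * d;   /* odd inputs  (b, d) */

        out[0] = x0 + y0;
        out[1] = x1 + y1;
        out[2] = x1 - y1;   /* differences come out in reverse order */
        out[3] = x0 - y0;
    }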
+
+# Columns are transformed first
+#
+# Store top left half of transMatrix2 in
+# Store bottom left half of transMatrix2 in HX(32,32)
+#
+# For 16x16
+# HX(0:15,0) contains input data before transform
+# HY(0:15,0) contains 32bit output data after transform
+# HX(32,0) contains even rows of left half of transMatrix2
+# HX(32,32) contains odd rows of left half of transMatrix2
+# HY(48,0) contains partial products ready for summing
+#
+
+
+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+# coeffs32
+# num32: number of 32x32 transforms
+# command 0 for transform, 1 for memclear16(int16_t *dst, num16), 2 for deblock, 3 for uv deblock, 4 for uv deblock with clear, 5 for run_command_list
+#
+
+.equ TRANS_SHIFT, 20 - BIT_DEPTH
+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
+.equ TRANS_ASL2, 16 - TRANS_SHIFT
+
+
+hevc_trans_16x16:
+ cmp r5,1
+ beq memclear16
+ cmp r5,2
+ beq hevc_deblock_16x16
+ cmp r5,3
+ beq hevc_uv_deblock_16x16
+ cmp r5,4
+ beq hevc_uv_deblock_16x16_with_clear
+ cmp r5,5
+ beq hevc_run_command_list
+
+ push r6-r15, lr # TODO cut down number of used registers
+ mov r14,r3 # coeffs32
+ mov r15,r4 # num32
+ mov r3, 16*2 # Stride of transMatrix2 in bytes
+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+
+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ # Now use r0 to describe which matrix we are working on.
+ # Allows us to prefetch the next block of coefficients for efficiency.
+ mov r0,0 # This describes the location where we read our coefficients from
+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+ mov r7,16*16*2 # Total block size
+ mov r8,64*16 # Value used to swap from current to next VRF location
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ mov r4,64 # Constant used for rounding first pass
+ mov r5,TRANS_RND2 # Constant used for rounding second pass
+
+ # At start of block r0,r1 point to the current block (that has already been loaded)
+block_loop:
+ eor r0,r8
+ add r1,r7
+ # Prefetch the next block
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ eor r0,r8
+ sub r1,r7
+
+ # Transform the current block
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
+
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
+
+ # Save results - note there has been a transposition during the processing so we save columns
+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
+
+ # Move onto next block
+ eor r0,r8
+ add r1,r7
+
+ addcmpbgt r2,-1,0,block_loop
+
+ # Now go and do any 32x32 transforms
+ b hevc_trans_32x32
+
+ pop r6-r15, pc
+
+# r1,r2,r3 r7,r8 should be preserved
+# HX(0++,0)+r0 is the block to be transformed
+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+# Use HY(48,0) for intermediate results
+# r0 can be used, but should be returned to its original value at the end
+col_trans_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+col_trans_odd_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_odd_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_odd_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+#
+hevc_trans_32x32:
+ mov r1,r14 # coeffs
+ mov r2,r15 # num
+
+ # Fetch odd transform matrix
+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+ #add r0, 16*16*2
+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+ mov r7, 16*16*2 # Total block size
+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+ # set r8 to 32byte aligned stack pointer
+ add r8,sp,31
+ lsr r8,5
+ lsl r8,5
+ mov r9,r8 # Backup of the temporary storage
+ mov r10,r1 # Backup of the coefficient buffer
+block_loop32:
+
+ # COLUMN TRANSFORM
+ mov r4, 64 # Constant used for rounding first pass
+ mov r5, 9 # left shift used for rounding first pass
+
+ # Transform the first 16 columns
+ mov r1,r10 # Input Coefficient buffer
+ mov r8,r9 # Output temporary storage
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ # ROW TRANSFORM
+ mov r4, TRANS_RND2 # Constant used for rounding second pass
+ mov r5, TRANS_ASL2 # left shift used for rounding second pass
+
+ mov r1,r9 # Input temporary storage
+ mov r8,r10 # Output Coefficient buffer
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ add r10, 32*32*2 # move onto next block of coefficients
+ addcmpbgt r2,-1,0,block_loop32
+
+ add sp,sp,32*32*2+32 # Restore stack
+
+ pop r6-r15, pc
+
+trans32:
+ push lr
+ # We can no longer afford the VRF space to do prefetching when doing 32x32
+ # Fetch the even rows
+ vldh HX(0++,0),(r1 += r3) REP 16
+ # Fetch the odd rows
+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+
+ # Transform the even rows using even matrix
+ mov r0, 0 # Even rows
+ bl col_trans_16
+
+ # Now transform the odd rows using odd matrix
+ mov r0, 64*16 # Odd rows
+ bl col_trans_odd_16
+
+ # Now apply butterfly to compute the first 16 results
+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ # 16bit results now in HX(48,32)
+ mov r0,r8
+ mov r6,32*2
+ vsth VX(48,32++),(r0+=r6) REP 16
+
+ # Now apply butterfly to compute the second 16 results (in reverse order)
+ vsub HY(63,0),HY(0 ,0),HY(16,0)
+ vsub HY(62,0),HY(1 ,0),HY(17,0)
+ vsub HY(61,0),HY(2 ,0),HY(18,0)
+ vsub HY(60,0),HY(3 ,0),HY(19,0)
+ vsub HY(59,0),HY(4 ,0),HY(20,0)
+ vsub HY(58,0),HY(5 ,0),HY(21,0)
+ vsub HY(57,0),HY(6 ,0),HY(22,0)
+ vsub HY(56,0),HY(7 ,0),HY(23,0)
+ vsub HY(55,0),HY(8 ,0),HY(24,0)
+ vsub HY(54,0),HY(9 ,0),HY(25,0)
+ vsub HY(53,0),HY(10,0),HY(26,0)
+ vsub HY(52,0),HY(11,0),HY(27,0)
+ vsub HY(51,0),HY(12,0),HY(28,0)
+ vsub HY(50,0),HY(13,0),HY(29,0)
+ vsub HY(49,0),HY(14,0),HY(30,0)
+ vsub HY(48,0),HY(15,0),HY(31,0)
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ add r0,r8,32
+ vsth VX(48,32++),(r0+=r6) REP 16
+ pop pc
+
+memclear16:
+ # r0 is address
+ # r1 is the number of 16-bit values to set to 0 (may overrun past the end and clear more than specified)
+ vmov HX(0++,0),0 REP 16
+ mov r2,32
+loop:
+ vsth HX(0++,0),(r0+=r2) REP 16
+ add r0,16*16*2
+ sub r1,16*16
+ cmp r1,0
+ bgt loop
+ b lr
+
+
+################################################################################
+# HEVC VPU Deblock
+#
+# Vertical edges before horizontal
+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+#
+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+# The VPU code works in units of 16x16 blocks.
+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+# One final horizontal filter is required at the end.
+# PCM is not allowed in this code.
+#
+#
+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering)
+
+.set P0,63
+.set P1,62
+.set P2,61
+.set P3,60
+.set Q0,59
+.set Q1,58
+.set Q2,57
+.set Q3,56
+
+.set dp,32
+.set dq,33
+.set d,34
+.set decision,35
+.set beta,36
+.set beta2,37
+.set beta3,38
+.set ptest,39
+.set qtest,40
+.set pqtest,41
+.set thresh,42
+.set deltatest, 44
+.set deltap1, 45
+.set tc25, 46
+.set setup,47
+.set tc,48
+.set tc25,49
+.set tc2, 50
+.set do_filter, 51
+.set delta, 52
+.set tc10, 53
+.set delta0, 54
+.set delta1, 55
+.set zeros, 0
+.set setup_input, 1
+.set deltaq1, 2
+
+
+
+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+# Row has num16 16x16 blocks across
+# Beta goes from 0 to 64
+# tc goes from 0 to 24
+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+# has 8 bytes per edge
+# has 16 bytes per direction
+# has 32 bytes per 16x16 block
+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
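As a reading aid, the setup[] layout spelt out above can be written as a C declaration. This is an editorial sketch with a hypothetical type name, not a structure defined by the patch; the sizes follow the comment: 8 bytes per edge, 16 per direction, 32 per 16x16 block.

    #include <stdint.h>

    typedef struct {
        struct {
            struct {
                uint8_t beta[4];   /* one beta per 4-pixel sub-edge */
                uint8_t tc[4];     /* one tc per 4-pixel sub-edge   */
            } edge[2];             /* first and second 8-pixel edge */
        } dir[2];                  /* 0 = vertical, 1 = horizontal  */
    } deblock_setup16;             /* 32 bytes per 16x16 block      */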
+hevc_deblock_16x16:
+ push r6-r15, lr
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+
+process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl vert_filter
+ sub r3,8
+ b start_deblock_loop
+deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt start_again
+ pop r6-r15, pc
+start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+vert_filter:
+ push lr
+
+ vmov HX(P3,0), V(16,12)+r3
+ vmov HX(P2,0), V(16,13)+r3
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+ vmov HX(Q2,0), V(16,18)+r3
+ vmov HX(Q3,0), V(16,19)+r3
+
+ bl do_luma_filter
+
+ vadds V(16,13)+r3, HX(P2,0), 0
+ vadds V(16,14)+r3, HX(P1,0), 0
+ vadds V(16,15)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds V(16,16)+r3, HX(Q0,0), 0
+ vadds V(16,17)+r3, HX(Q1,0), 0
+ vadds V(16,18)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+horz_filter:
+ push lr
+
+ vmov HX(P3,0), H(12,0)+r3
+ vmov HX(P2,0), H(13,0)+r3
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+ vmov HX(Q2,0), H(18,0)+r3
+ vmov HX(Q3,0), H(19,0)+r3
+
+ bl do_luma_filter
+
+ vadds H(13,0)+r3, HX(P2,0), 0
+ vadds H(14,0)+r3, HX(P1,0), 0
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+ vadds H(17,0)+r3, HX(Q1,0), 0
+ vadds H(18,0)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4 length edge
+do_luma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+ valtl HX(beta,0),H(setup,0),H(setup,0)
+ valtu HX(tc,0),H(setup,0),H(setup,0)
+ vmul HX(tc25,0), HX(tc,0), 5
+ vadd HX(tc25,0),HX(tc25,0), 1
+ vasr HX(tc25,0), HX(tc25,0), 1
+
+ # Compute decision
+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+
+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+
+ vadd HX(d,0), HX(dp,0), HX(dq,0)
+ vasr HX(beta2,0),HX(beta,0),2
+ vasr HX(beta3,0),HX(beta,0),3
+
+ # Compute flags that are negative if all conditions pass
+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+
+ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+ vmov HX(decision,0), 1 IFNN
+ vadd H(decision,0),H(decision,3),0 IFN
+ vadd H(decision,16),H(decision,19),0 IFN
+ vmov -,HX(decision,0) SETF # N marks strong filter
+ vmov HX(decision,0), 1 IFNN # NN marks normal filter
+
+ vadd HX(do_filter,0), HX(d,3), HX(d,0)
+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+ vmov HX(decision,0),0 IFNN # Z marks no filter
+
+ # Expand out decision (currently one valid value every 4 pixels) 0...1...2...3
+ # First extract out even terms
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
+ # Now expand back
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+
+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+
+ # Do a quick check to see if there is anything to do
+ mov r11, 0 # Signal no filtering
+ vmov -,1 IFNZ SUMS r5
+ cmp r5,0
+ beq filtering_done
+ mov r11, 1 # Signal some filtering
+ # And whether there is any strong filtering
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq normal_filtering
+
+ ##############################################################################
+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+
+ # Take a copy of the original pixels for use in decision calculation
+ vmov HX(P0,32),HX(P0,0)
+ vmov HX(Q0,32),HX(Q0,0)
+ vmov HX(P1,32),HX(P1,0)
+ vmov HX(Q1,32),HX(Q1,0)
+ vmov HX(P2,32),HX(P2,0)
+ vmov HX(Q2,32),HX(Q2,0)
+
+ vadd -,HX(P2,32),4 CLRA SACC
+ vshl -,HX(P1,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl HX(delta,0),HX(Q1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+
+ vadd -,HX(P2,32),2 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vshl HX(delta,0),HX(Q0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(P1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+
+ vadd -,HX(Q0,32),4 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vmul -,HX(P2,32),3 SACC
+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+ #vmov HX(P2,0),3 IFN
+
+ # Now reverse all P/Qs
+
+ vadd -,HX(Q2,32),4 CLRA SACC
+ vshl -,HX(Q1,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl HX(delta,0),HX(P1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+
+ vadd -,HX(Q2,32),2 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vshl HX(delta,0),HX(P0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+
+ vadd -,HX(P0,32),4 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vmul -,HX(Q2,32),3 SACC
+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+
+ ##############################################################################
+ # Normal filtering
+normal_filtering:
+ # Invert the decision flags
+ # make the instruction more complicated than needed, as the assembler has a bug and loses the SETF otherwise
+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+ vmov -, HX(tc10,0) SETF # IFN means normal filtering
+
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq filtering_done
+
+ vasr HX(tc2,0), HX(tc,0), 1
+ vmul HX(tc10,0), HX(tc,0), 10
+
+ vasr HX(thresh,0), HX(beta,0), 1
+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+
+ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P1 pixel
+ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q1 pixel
+ # Expand ptest and qtest together
+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+
+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+ vmov -,8 CLRA SACC
+ vmul -,HX(delta0,0), 9 SACC
+ vmul HX(delta0,0),HX(delta1,0), r6 SACC
+ vasr HX(delta0,0), HX(delta0,0), 4
+ vdist HX(deltatest,0), HX(delta0,0), 0
+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+
+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+
+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+ vadd HX(deltap1,0), HX(deltap1,0), 1
+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+ vasr HX(deltap1,0), HX(deltap1,0), 1
+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+
+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+ vadd HX(deltaq1,0), HX(deltaq1,0), 1
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+ vrsub -, HX(delta0,0), 0 SACC
+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1
+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+
+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+
+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+
+ vmov -,HX(deltatest,0) SETF
+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+
+ #vmov HX(P2,0),1 IFN
+
+filtering_done:
+ b lr
+
+
+hevc_uv_deblock_16x16:
+ push r6-r15, lr
+ mov r14,0
+ b hevc_uv_start
+hevc_uv_deblock_16x16_with_clear:
+ push r6-r15, lr
+ mov r14,1
+ b hevc_uv_start
+
+hevc_uv_start:
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+# r14 is 1 if we should clear the old contents, or 0 if not
+
+uv_process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ cmp r14,1
+ bne uv_skip0
+ vstb H(zeros,0),(r4)
+uv_skip0:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl uv_vert_filter
+ sub r3,8
+ b uv_start_deblock_loop
+uv_deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ cmp r14,1
+ bne uv_skip1
+ vstb H(zeros,0),(r4)
+uv_skip1:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip3
+ vstb H(zeros,0),-16(r4)
+uv_skip3:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,uv_skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+uv_start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt uv_deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip2
+ vstb H(zeros,0),-16(r4)
+uv_skip2:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,uv_skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt uv_start_again
+ pop r6-r15, pc
+uv_start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b uv_process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16-high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - a final saturation step is performed when placing them back into their correct locations
+
+uv_vert_filter:
+ push lr
+
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+
+ bl do_chroma_filter
+
+ vadds V(16,15)+r3, HX(P0,0), 0
+ vadds V(16,16)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+uv_horz_filter:
+ push lr
+
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+
+ bl do_chroma_filter
+
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# r4 points to the array of beta/tc values, one entry per 4-sample edge segment
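+# For reference, this implements the usual HEVC chroma delta (the final clip
+# to the sample range is done by the vadds/vsubs in the callers):
+#   delta = clamp((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, +tc)
+#   p0 += delta ; q0 -= delta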
+do_chroma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+ valtl HX(tc,0),H(setup,0),H(setup,0)
+
+ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+ vsub -,HX(P1,0),HX(Q1,0) SACC
+ vmov HX(delta,0),4 SACC
+ vasr HX(delta,0),HX(delta,0),3
+ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+ vadd HX(P0,0),HX(P0,0),HX(delta,0)
+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+ b lr
+
+# r0 = list
+# r1 = number
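+# Each command is six 32-bit words which are loaded into r0-r5 and passed
+# directly to hevc_trans_16x16, so the list is just that routine's argument
+# sets laid end to end.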
+hevc_run_command_list:
+ push r6-r7, lr
+ mov r6, r0
+ mov r7, r1
+loop_cmds:
+ ld r0,(r6) # How to encode r6++?
+ add r6,4
+ ld r1,(r6)
+ add r6,4
+ ld r2,(r6)
+ add r6,4
+ ld r3,(r6)
+ add r6,4
+ ld r4,(r6)
+ add r6,4
+ ld r5,(r6)
+ add r6,4
+ bl hevc_trans_16x16
+ sub r7,1
+ cmp r7,0
+ bgt loop_cmds
+
+ pop r6-r7, pc
diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
new file mode 100644
index 0000000000..b0e9902d82
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform10.h
@@ -0,0 +1,3070 @@
+static const unsigned char rpi_hevc_transform10 [] = {
+21,
+106,
+0,
+144,
+47,
+1,
+37,
+106,
+0,
+144,
+66,
+1,
+53,
+106,
+0,
+144,
+192,
+4,
+69,
+106,
+0,
+144,
+192,
+4,
+85,
+106,
+0,
+144,
+220,
+5,
+169,
+3,
+62,
+64,
+79,
+64,
+3,
+232,
+32,
+0,
+0,
+0,
+12,
+248,
+0,
+136,
+0,
+0,
+192,
+248,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+12,
+248,
+0,
+168,
+0,
+0,
+192,
+248,
+0,
+0,
+0,
+96,
+3,
+232,
+32,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+8,
+232,
+0,
+4,
+0,
+0,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+4,
+232,
+64,
+0,
+0,
+0,
+5,
+232,
+0,
+2,
+0,
+0,
+128,
+69,
+113,
+66,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+128,
+69,
+113,
+70,
+128,
+144,
+40,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+16,
+0,
+76,
+254,
+48,
+192,
+9,
+4,
+32,
+8,
+0,
+0,
+4,
+254,
+0,
+144,
+128,
+2,
+0,
+8,
+2,
+0,
+128,
+144,
+23,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+20,
+0,
+76,
+254,
+48,
+192,
+6,
+4,
+32,
+8,
+0,
+0,
+140,
+248,
+44,
+0,
+0,
+0,
+32,
+48,
+4,
+0,
+128,
+69,
+113,
+66,
+242,
+140,
+211,
+192,
+34,
+31,
+41,
+3,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+96,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+224,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+225,
+64,
+242,
+64,
+3,
+232,
+128,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+57,
+239,
+224,
+247,
+255,
+255,
+72,
+192,
+95,
+207,
+88,
+122,
+88,
+124,
+137,
+64,
+26,
+64,
+4,
+232,
+64,
+0,
+0,
+0,
+149,
+96,
+161,
+64,
+152,
+64,
+128,
+144,
+35,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+27,
+0,
+4,
+232,
+0,
+2,
+0,
+0,
+101,
+96,
+145,
+64,
+168,
+64,
+128,
+144,
+19,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+11,
+0,
+74,
+232,
+0,
+8,
+0,
+0,
+242,
+140,
+221,
+192,
+57,
+239,
+32,
+8,
+0,
+0,
+41,
+3,
+239,
+3,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+248,
+4,
+0,
+12,
+248,
+0,
+132,
+64,
+0,
+192,
+248,
+4,
+0,
+0,
+96,
+255,
+159,
+154,
+255,
+0,
+232,
+0,
+4,
+0,
+0,
+255,
+159,
+165,
+255,
+4,
+255,
+48,
+204,
+16,
+3,
+224,
+251,
+62,
+0,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+128,
+64,
+6,
+232,
+64,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+32,
+247,
+240,
+207,
+16,
+3,
+32,
+247,
+176,
+207,
+17,
+19,
+32,
+247,
+112,
+207,
+18,
+35,
+32,
+247,
+48,
+207,
+19,
+51,
+32,
+247,
+240,
+206,
+20,
+67,
+32,
+247,
+176,
+206,
+21,
+83,
+32,
+247,
+112,
+206,
+22,
+99,
+32,
+247,
+48,
+206,
+23,
+115,
+32,
+247,
+240,
+205,
+24,
+131,
+32,
+247,
+176,
+205,
+25,
+147,
+32,
+247,
+112,
+205,
+26,
+163,
+32,
+247,
+48,
+205,
+27,
+179,
+32,
+247,
+240,
+204,
+28,
+195,
+32,
+247,
+176,
+204,
+29,
+211,
+32,
+247,
+112,
+204,
+30,
+227,
+32,
+247,
+48,
+204,
+31,
+243,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+0,
+237,
+32,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+111,
+3,
+4,
+254,
+0,
+128,
+0,
+4,
+0,
+248,
+0,
+0,
+2,
+232,
+32,
+0,
+0,
+0,
+140,
+248,
+32,
+0,
+0,
+0,
+224,
+35,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+193,
+232,
+0,
+1,
+0,
+0,
+1,
+106,
+116,
+30,
+90,
+0,
+169,
+3,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+137,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+129,
+0,
+131,
+102,
+0,
+158,
+67,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+108,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+100,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+161,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+150,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+182,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+112,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+101,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+103,
+255,
+239,
+3,
+0,
+254,
+0,
+143,
+92,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+93,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+210,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+211,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+107,
+0,
+8,
+255,
+99,
+23,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+23,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+52,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+52,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+0,
+143,
+12,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+13,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+18,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+19,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+33,
+0,
+8,
+255,
+99,
+3,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+3,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+4,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+4,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+137,
+47,
+240,
+40,
+246,
+2,
+140,
+47,
+240,
+128,
+245,
+99,
+140,
+5,
+4,
+0,
+247,
+99,
+140,
+1,
+20,
+88,
+246,
+99,
+140,
+1,
+20,
+0,
+247,
+35,
+136,
+62,
+226,
+32,
+247,
+35,
+136,
+32,
+210,
+0,
+247,
+34,
+136,
+63,
+2,
+208,
+246,
+34,
+136,
+0,
+4,
+0,
+247,
+99,
+136,
+58,
+162,
+32,
+247,
+99,
+136,
+33,
+146,
+0,
+247,
+98,
+136,
+59,
+18,
+208,
+246,
+98,
+136,
+0,
+20,
+0,
+247,
+162,
+136,
+33,
+2,
+88,
+246,
+98,
+137,
+2,
+68,
+88,
+246,
+162,
+137,
+3,
+68,
+208,
+254,
+227,
+136,
+60,
+242,
+192,
+243,
+188,
+11,
+208,
+254,
+227,
+136,
+56,
+178,
+192,
+243,
+188,
+10,
+32,
+255,
+226,
+136,
+38,
+58,
+192,
+243,
+60,
+0,
+208,
+254,
+227,
+136,
+59,
+242,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+49,
+58,
+192,
+243,
+60,
+128,
+0,
+255,
+226,
+136,
+34,
+34,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+37,
+58,
+192,
+243,
+60,
+128,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+194,
+8,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+255,
+202,
+40,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+254,
+0,
+240,
+35,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+226,
+140,
+34,
+34,
+195,
+243,
+60,
+0,
+32,
+255,
+227,
+140,
+36,
+58,
+192,
+243,
+60,
+0,
+0,
+254,
+192,
+136,
+0,
+4,
+0,
+240,
+0,
+160,
+16,
+246,
+226,
+136,
+35,
+50,
+16,
+246,
+226,
+136,
+35,
+50,
+32,
+246,
+226,
+136,
+35,
+50,
+32,
+254,
+226,
+136,
+35,
+58,
+192,
+243,
+60,
+0,
+11,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+115,
+5,
+106,
+0,
+144,
+173,
+1,
+27,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+227,
+0,
+64,
+246,
+163,
+140,
+1,
+4,
+0,
+246,
+192,
+175,
+63,
+2,
+0,
+246,
+192,
+174,
+59,
+2,
+0,
+246,
+128,
+175,
+62,
+2,
+0,
+246,
+128,
+174,
+58,
+2,
+0,
+246,
+64,
+175,
+61,
+2,
+0,
+246,
+64,
+174,
+57,
+2,
+0,
+255,
+43,
+240,
+4,
+212,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+228,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+191,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+143,
+52,
+242,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+212,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+180,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+190,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+143,
+52,
+226,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+180,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+212,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+196,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+189,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+143,
+52,
+210,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+148,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+164,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+228,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+187,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+142,
+52,
+178,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+148,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+244,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+186,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+142,
+52,
+162,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+244,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+148,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+132,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+185,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+142,
+52,
+146,
+192,
+243,
+60,
+128,
+64,
+255,
+98,
+141,
+0,
+52,
+192,
+243,
+0,
+0,
+0,
+254,
+0,
+240,
+53,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+177,
+0,
+88,
+246,
+163,
+140,
+1,
+4,
+128,
+245,
+99,
+141,
+10,
+4,
+88,
+246,
+162,
+138,
+1,
+68,
+0,
+247,
+162,
+138,
+36,
+162,
+88,
+254,
+162,
+138,
+3,
+164,
+192,
+243,
+128,
+11,
+0,
+255,
+226,
+137,
+32,
+2,
+195,
+243,
+60,
+0,
+32,
+247,
+226,
+137,
+42,
+114,
+0,
+255,
+34,
+138,
+33,
+18,
+195,
+243,
+60,
+0,
+32,
+247,
+34,
+138,
+42,
+130,
+16,
+246,
+98,
+138,
+40,
+114,
+16,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+226,
+137,
+41,
+146,
+40,
+246,
+34,
+138,
+41,
+146,
+32,
+247,
+163,
+141,
+63,
+178,
+32,
+247,
+227,
+141,
+62,
+162,
+0,
+254,
+0,
+240,
+8,
+4,
+0,
+240,
+128,
+11,
+128,
+253,
+35,
+240,
+9,
+100,
+192,
+243,
+128,
+10,
+128,
+253,
+163,
+141,
+128,
+115,
+192,
+243,
+152,
+10,
+88,
+246,
+163,
+141,
+4,
+100,
+208,
+246,
+35,
+139,
+0,
+100,
+32,
+255,
+34,
+139,
+53,
+202,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+139,
+0,
+4,
+0,
+240,
+0,
+160,
+240,
+246,
+163,
+141,
+48,
+98,
+0,
+247,
+99,
+139,
+63,
+210,
+0,
+247,
+98,
+139,
+1,
+212,
+88,
+254,
+98,
+139,
+1,
+212,
+192,
+243,
+128,
+11,
+32,
+255,
+99,
+139,
+62,
+98,
+192,
+243,
+188,
+10,
+88,
+246,
+98,
+139,
+1,
+212,
+240,
+246,
+98,
+139,
+50,
+210,
+0,
+247,
+163,
+128,
+59,
+146,
+0,
+247,
+160,
+128,
+1,
+36,
+88,
+254,
+160,
+128,
+1,
+36,
+192,
+243,
+128,
+11,
+0,
+247,
+163,
+128,
+58,
+98,
+64,
+255,
+35,
+240,
+0,
+100,
+192,
+243,
+128,
+10,
+64,
+255,
+163,
+128,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+160,
+128,
+1,
+36,
+240,
+246,
+160,
+128,
+50,
+34,
+8,
+255,
+227,
+143,
+54,
+242,
+192,
+243,
+60,
+128,
+40,
+255,
+227,
+142,
+54,
+178,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+39,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+143,
+45,
+226,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+44,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+40,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+142,
+2,
+162,
+192,
+243,
+60,
+128,
+90,
+0,
+169,
+3,
+14,
+96,
+4,
+31,
+169,
+3,
+30,
+96,
+1,
+31,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+143,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+135,
+0,
+131,
+102,
+0,
+158,
+71,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+112,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+104,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+123,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+112,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+178,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+72,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+61,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+95,
+255,
+239,
+3,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+47,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+13,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+140,
+47,
+240,
+32,
+247,
+35,
+141,
+63,
+178,
+64,
+254,
+35,
+141,
+2,
+68,
+192,
+243,
+128,
+11,
+32,
+255,
+35,
+240,
+58,
+226,
+192,
+243,
+188,
+10,
+0,
+254,
+0,
+141,
+4,
+4,
+0,
+240,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+240,
+246,
+35,
+141,
+48,
+66,
+0,
+247,
+227,
+143,
+52,
+242,
+32,
+247,
+227,
+142,
+52,
+178,
+90,
+0,
+161,
+3,
+6,
+64,
+23,
+64,
+96,
+8,
+70,
+98,
+97,
+8,
+70,
+98,
+98,
+8,
+70,
+98,
+99,
+8,
+70,
+98,
+100,
+8,
+70,
+98,
+101,
+8,
+70,
+98,
+255,
+159,
+8,
+250,
+23,
+102,
+7,
+106,
+112,
+30,
+33,
+3,
+};
diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
new file mode 100644
index 0000000000..2901b6568d
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform8.h
@@ -0,0 +1,3070 @@
+static const unsigned char rpi_hevc_transform8 [] = {
+21,
+106,
+0,
+144,
+47,
+1,
+37,
+106,
+0,
+144,
+66,
+1,
+53,
+106,
+0,
+144,
+192,
+4,
+69,
+106,
+0,
+144,
+192,
+4,
+85,
+106,
+0,
+144,
+220,
+5,
+169,
+3,
+62,
+64,
+79,
+64,
+3,
+232,
+32,
+0,
+0,
+0,
+12,
+248,
+0,
+136,
+0,
+0,
+192,
+248,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+12,
+248,
+0,
+168,
+0,
+0,
+192,
+248,
+0,
+0,
+0,
+96,
+3,
+232,
+32,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+8,
+232,
+0,
+4,
+0,
+0,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+4,
+232,
+64,
+0,
+0,
+0,
+5,
+232,
+0,
+8,
+0,
+0,
+128,
+69,
+113,
+66,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+128,
+69,
+113,
+70,
+128,
+144,
+40,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+16,
+0,
+76,
+254,
+48,
+192,
+9,
+4,
+32,
+8,
+0,
+0,
+4,
+254,
+0,
+144,
+128,
+2,
+0,
+8,
+2,
+0,
+128,
+144,
+23,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+20,
+0,
+76,
+254,
+48,
+192,
+4,
+4,
+32,
+8,
+0,
+0,
+140,
+248,
+44,
+0,
+0,
+0,
+32,
+48,
+4,
+0,
+128,
+69,
+113,
+66,
+242,
+140,
+211,
+192,
+34,
+31,
+41,
+3,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+96,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+224,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+225,
+64,
+242,
+64,
+3,
+232,
+128,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+57,
+239,
+224,
+247,
+255,
+255,
+72,
+192,
+95,
+207,
+88,
+122,
+88,
+124,
+137,
+64,
+26,
+64,
+4,
+232,
+64,
+0,
+0,
+0,
+149,
+96,
+161,
+64,
+152,
+64,
+128,
+144,
+35,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+27,
+0,
+4,
+232,
+0,
+8,
+0,
+0,
+69,
+96,
+145,
+64,
+168,
+64,
+128,
+144,
+19,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+11,
+0,
+74,
+232,
+0,
+8,
+0,
+0,
+242,
+140,
+221,
+192,
+57,
+239,
+32,
+8,
+0,
+0,
+41,
+3,
+239,
+3,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+248,
+4,
+0,
+12,
+248,
+0,
+132,
+64,
+0,
+192,
+248,
+4,
+0,
+0,
+96,
+255,
+159,
+154,
+255,
+0,
+232,
+0,
+4,
+0,
+0,
+255,
+159,
+165,
+255,
+4,
+255,
+48,
+204,
+16,
+3,
+224,
+251,
+62,
+0,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+128,
+64,
+6,
+232,
+64,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+32,
+247,
+240,
+207,
+16,
+3,
+32,
+247,
+176,
+207,
+17,
+19,
+32,
+247,
+112,
+207,
+18,
+35,
+32,
+247,
+48,
+207,
+19,
+51,
+32,
+247,
+240,
+206,
+20,
+67,
+32,
+247,
+176,
+206,
+21,
+83,
+32,
+247,
+112,
+206,
+22,
+99,
+32,
+247,
+48,
+206,
+23,
+115,
+32,
+247,
+240,
+205,
+24,
+131,
+32,
+247,
+176,
+205,
+25,
+147,
+32,
+247,
+112,
+205,
+26,
+163,
+32,
+247,
+48,
+205,
+27,
+179,
+32,
+247,
+240,
+204,
+28,
+195,
+32,
+247,
+176,
+204,
+29,
+211,
+32,
+247,
+112,
+204,
+30,
+227,
+32,
+247,
+48,
+204,
+31,
+243,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+0,
+237,
+32,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+111,
+3,
+4,
+254,
+0,
+128,
+0,
+4,
+0,
+248,
+0,
+0,
+2,
+232,
+32,
+0,
+0,
+0,
+140,
+248,
+32,
+0,
+0,
+0,
+224,
+35,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+193,
+232,
+0,
+1,
+0,
+0,
+1,
+106,
+116,
+30,
+90,
+0,
+169,
+3,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+137,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+129,
+0,
+131,
+102,
+0,
+158,
+67,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+108,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+100,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+161,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+150,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+182,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+112,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+101,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+103,
+255,
+239,
+3,
+0,
+254,
+0,
+143,
+92,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+93,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+210,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+211,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+107,
+0,
+8,
+255,
+99,
+23,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+23,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+52,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+52,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+0,
+143,
+12,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+13,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+18,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+19,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+33,
+0,
+8,
+255,
+99,
+3,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+3,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+4,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+4,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+137,
+47,
+240,
+40,
+246,
+2,
+140,
+47,
+240,
+128,
+245,
+99,
+140,
+5,
+4,
+0,
+247,
+99,
+140,
+1,
+20,
+88,
+246,
+99,
+140,
+1,
+20,
+0,
+247,
+35,
+136,
+62,
+226,
+32,
+247,
+35,
+136,
+32,
+210,
+0,
+247,
+34,
+136,
+63,
+2,
+208,
+246,
+34,
+136,
+0,
+4,
+0,
+247,
+99,
+136,
+58,
+162,
+32,
+247,
+99,
+136,
+33,
+146,
+0,
+247,
+98,
+136,
+59,
+18,
+208,
+246,
+98,
+136,
+0,
+20,
+0,
+247,
+162,
+136,
+33,
+2,
+88,
+246,
+98,
+137,
+2,
+68,
+88,
+246,
+162,
+137,
+3,
+68,
+208,
+254,
+227,
+136,
+60,
+242,
+192,
+243,
+188,
+11,
+208,
+254,
+227,
+136,
+56,
+178,
+192,
+243,
+188,
+10,
+32,
+255,
+226,
+136,
+38,
+58,
+192,
+243,
+60,
+0,
+208,
+254,
+227,
+136,
+59,
+242,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+49,
+58,
+192,
+243,
+60,
+128,
+0,
+255,
+226,
+136,
+34,
+34,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+37,
+58,
+192,
+243,
+60,
+128,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+194,
+8,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+255,
+202,
+40,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+254,
+0,
+240,
+35,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+226,
+140,
+34,
+34,
+195,
+243,
+60,
+0,
+32,
+255,
+227,
+140,
+36,
+58,
+192,
+243,
+60,
+0,
+0,
+254,
+192,
+136,
+0,
+4,
+0,
+240,
+0,
+160,
+16,
+246,
+226,
+136,
+35,
+50,
+16,
+246,
+226,
+136,
+35,
+50,
+32,
+246,
+226,
+136,
+35,
+50,
+32,
+254,
+226,
+136,
+35,
+58,
+192,
+243,
+60,
+0,
+11,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+115,
+5,
+106,
+0,
+144,
+173,
+1,
+27,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+227,
+0,
+64,
+246,
+163,
+140,
+1,
+4,
+0,
+246,
+192,
+175,
+63,
+2,
+0,
+246,
+192,
+174,
+59,
+2,
+0,
+246,
+128,
+175,
+62,
+2,
+0,
+246,
+128,
+174,
+58,
+2,
+0,
+246,
+64,
+175,
+61,
+2,
+0,
+246,
+64,
+174,
+57,
+2,
+0,
+255,
+43,
+240,
+4,
+212,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+228,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+191,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+143,
+52,
+242,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+212,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+180,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+190,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+143,
+52,
+226,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+180,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+212,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+196,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+189,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+143,
+52,
+210,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+148,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+164,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+228,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+187,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+142,
+52,
+178,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+148,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+244,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+186,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+142,
+52,
+162,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+244,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+148,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+132,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+185,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+142,
+52,
+146,
+192,
+243,
+60,
+128,
+64,
+255,
+98,
+141,
+0,
+52,
+192,
+243,
+0,
+0,
+0,
+254,
+0,
+240,
+53,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+177,
+0,
+88,
+246,
+163,
+140,
+1,
+4,
+128,
+245,
+99,
+141,
+10,
+4,
+88,
+246,
+162,
+138,
+1,
+68,
+0,
+247,
+162,
+138,
+36,
+162,
+88,
+254,
+162,
+138,
+3,
+164,
+192,
+243,
+128,
+11,
+0,
+255,
+226,
+137,
+32,
+2,
+195,
+243,
+60,
+0,
+32,
+247,
+226,
+137,
+42,
+114,
+0,
+255,
+34,
+138,
+33,
+18,
+195,
+243,
+60,
+0,
+32,
+247,
+34,
+138,
+42,
+130,
+16,
+246,
+98,
+138,
+40,
+114,
+16,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+226,
+137,
+41,
+146,
+40,
+246,
+34,
+138,
+41,
+146,
+32,
+247,
+163,
+141,
+63,
+178,
+32,
+247,
+227,
+141,
+62,
+162,
+0,
+254,
+0,
+240,
+8,
+4,
+0,
+240,
+128,
+11,
+128,
+253,
+35,
+240,
+9,
+100,
+192,
+243,
+128,
+10,
+128,
+253,
+163,
+141,
+128,
+115,
+192,
+243,
+152,
+10,
+88,
+246,
+163,
+141,
+4,
+100,
+208,
+246,
+35,
+139,
+0,
+100,
+32,
+255,
+34,
+139,
+53,
+202,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+139,
+0,
+4,
+0,
+240,
+0,
+160,
+240,
+246,
+163,
+141,
+48,
+98,
+0,
+247,
+99,
+139,
+63,
+210,
+0,
+247,
+98,
+139,
+1,
+212,
+88,
+254,
+98,
+139,
+1,
+212,
+192,
+243,
+128,
+11,
+32,
+255,
+99,
+139,
+62,
+98,
+192,
+243,
+188,
+10,
+88,
+246,
+98,
+139,
+1,
+212,
+240,
+246,
+98,
+139,
+50,
+210,
+0,
+247,
+163,
+128,
+59,
+146,
+0,
+247,
+160,
+128,
+1,
+36,
+88,
+254,
+160,
+128,
+1,
+36,
+192,
+243,
+128,
+11,
+0,
+247,
+163,
+128,
+58,
+98,
+64,
+255,
+35,
+240,
+0,
+100,
+192,
+243,
+128,
+10,
+64,
+255,
+163,
+128,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+160,
+128,
+1,
+36,
+240,
+246,
+160,
+128,
+50,
+34,
+8,
+255,
+227,
+143,
+54,
+242,
+192,
+243,
+60,
+128,
+40,
+255,
+227,
+142,
+54,
+178,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+39,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+143,
+45,
+226,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+44,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+40,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+142,
+2,
+162,
+192,
+243,
+60,
+128,
+90,
+0,
+169,
+3,
+14,
+96,
+4,
+31,
+169,
+3,
+30,
+96,
+1,
+31,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+143,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+135,
+0,
+131,
+102,
+0,
+158,
+71,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+112,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+104,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+123,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+112,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+178,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+72,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+61,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+95,
+255,
+239,
+3,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+47,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+13,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+140,
+47,
+240,
+32,
+247,
+35,
+141,
+63,
+178,
+64,
+254,
+35,
+141,
+2,
+68,
+192,
+243,
+128,
+11,
+32,
+255,
+35,
+240,
+58,
+226,
+192,
+243,
+188,
+10,
+0,
+254,
+0,
+141,
+4,
+4,
+0,
+240,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+240,
+246,
+35,
+141,
+48,
+66,
+0,
+247,
+227,
+143,
+52,
+242,
+32,
+247,
+227,
+142,
+52,
+178,
+90,
+0,
+161,
+3,
+6,
+64,
+23,
+64,
+96,
+8,
+70,
+98,
+97,
+8,
+70,
+98,
+98,
+8,
+70,
+98,
+99,
+8,
+70,
+98,
+100,
+8,
+70,
+98,
+101,
+8,
+70,
+98,
+255,
+159,
+8,
+250,
+23,
+102,
+7,
+106,
+112,
+30,
+33,
+3,
+};
diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
new file mode 100644
index 0000000000..0255f5dd44
--- /dev/null
+++ b/libavcodec/rpi_mailbox.c
@@ -0,0 +1,149 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef RPI
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+
+#include <linux/ioctl.h>
+
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+#include "rpi_mailbox.h"
+//#include <interface/vctypes/vc_image_structs.h>
+
+/*
+ * use ioctl to send mbox property message
+ */
+
+static int mbox_property(int file_desc, void *buf)
+{
+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+ if (ret_val < 0) {
+ printf("ioctl_set_msg failed:%d\n", ret_val);
+ }
+
+#ifdef DEBUG
+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+ for (i=0; i<size/4; i++)
+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+#endif
+ return ret_val;
+}
+
+unsigned mbox_mem_lock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000d; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mbox_mem_unlock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000e; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+#define GET_VCIMAGE_PARAMS 0x30044
+
+int mbox_get_image_params(int fd, VC_IMAGE_T * img)
+{
+ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
+ uint32_t * p = buf;
+ void * rimg;
+ int rv;
+
+ *p++ = 0; // size
+ *p++ = 0; // process request
+ *p++ = GET_VCIMAGE_PARAMS;
+ *p++ = sizeof(*img);
+ *p++ = sizeof(*img);
+ rimg = p;
+ memcpy(p, img, sizeof(*img));
+ p += sizeof(*img) / sizeof(*p);
+ *p++ = 0; // End tag
+ buf[0] = (p - buf) * sizeof(*p);
+
+ rv = mbox_property(fd, buf);
+ memcpy(img, rimg, sizeof(*img));
+
+ return rv;
+}
+
+int mbox_open(void) {
+ int file_desc;
+
+ // open a char device file used for communicating with kernel mbox driver
+ file_desc = open(DEVICE_FILE_NAME, 0);
+ if (file_desc < 0) {
+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+ }
+ return file_desc;
+}
+
+void mbox_close(int file_desc) {
+ close(file_desc);
+}
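+
+// Typical usage (an illustrative sketch only - the real callers are the
+// gpu_malloc_* / gpu_free helpers in rpi_qpu.c):
+// int mb = mbox_open();
+// unsigned vc_addr = mbox_mem_lock(mb, vc_handle); // vc_handle from vcsm_vc_hdl_from_hdl()
+// ... pass vc_addr (a VideoCore bus address) to the VPU/QPU ...
+// mbox_mem_unlock(mb, vc_handle);
+// mbox_close(mb);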
+
+#endif
+
diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
new file mode 100644
index 0000000000..b3168788d2
--- /dev/null
+++ b/libavcodec/rpi_mailbox.h
@@ -0,0 +1,58 @@
+#ifndef RPI_MAILBOX_H
+#define RPI_MAILBOX_H
+
+/* The image structure. */
+typedef struct vc_image_extra_uv_s {
+ void *u, *v;
+ int vpitch;
+} VC_IMAGE_EXTRA_UV_T;
+
+typedef union {
+ VC_IMAGE_EXTRA_UV_T uv;
+// VC_IMAGE_EXTRA_RGBA_T rgba;
+// VC_IMAGE_EXTRA_PAL_T pal;
+// VC_IMAGE_EXTRA_TF_T tf;
+// VC_IMAGE_EXTRA_BAYER_T bayer;
+// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
+// VC_IMAGE_EXTRA_CODEC_T codec;
+// VC_IMAGE_EXTRA_OPENGL_T opengl;
+} VC_IMAGE_EXTRA_T;
+
+
+typedef struct VC_IMAGE_T {
+ unsigned short type; /* should restrict to 16 bits */
+ unsigned short info; /* format-specific info; zero for VC02 behaviour */
+ unsigned short width; /* width in pixels */
+ unsigned short height; /* height in pixels */
+ int pitch; /* pitch of image_data array in bytes */
+ int size; /* number of bytes available in image_data array */
+ void *image_data; /* pixel data */
+ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
+ void *metadata; /* metadata header for the image */
+ void *pool_object; /* nonNULL if image was allocated from a vc_pool */
+ int mem_handle; /* the mem handle for relocatable memory storage */
+ int metadata_size; /* size of metadata of each channel in bytes */
+ int channel_offset; /* offset of consecutive channels in bytes */
+ uint32_t video_timestamp;/* 90000 Hz RTP time domain - derived from audio timestamp */
+ uint8_t num_channels; /* number of channels (2 for stereo) */
+ uint8_t current_channel;/* the channel this header is currently pointing to */
+ uint8_t linked_multichann_flag;/* Indicates the header has the linked-multichannel structure*/
+ uint8_t is_channel_linked; /* Track if the above structure has been used to link the header
+ into a linked-multichannel image */
+ uint8_t channel_index; /* index of the channel this header represents while
+ it is being linked. */
+ uint8_t _dummy[3]; /* pad struct to 64 bytes */
+} VC_IMAGE_T;
+
+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; /* compile-time check: array size becomes -1 (an error) unless VC_IMAGE_T is exactly 64 bytes */
+
+
+extern int mbox_open(void);
+extern void mbox_close(int file_desc);
+
+extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
+extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
+
+int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+
+#endif
diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h
new file mode 100644
index 0000000000..e6127749ea
--- /dev/null
+++ b/libavcodec/rpi_opts.h
@@ -0,0 +1,46 @@
+#ifndef AVCODEC_RPI_OPTS_H
+#define AVCODEC_RPI_OPTS_H
+
+// define RPI to split the CABAC/prediction/transform into separate stages
+#ifndef RPI
+
+ #define RPI_INTER 0
+ #define RPI_TSTATS 0
+ #define RPI_HEVC_SAND 0
+
+#else
+ #include "config.h"
+
+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU
+
+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+ // This has no effect unless RPI_WORKER is defined
+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as
+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one
+ // free for the foreground to fill in.
+ #define RPI_MAX_JOBS 2
+
+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+ // As it stands there is something mildly broken in VPU deblock - looks mostly OK
+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_)
+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM
+// #define RPI_DEBLOCK_VPU
+
+ #define RPI_VPU_DEBLOCK_CACHED 1
+
+ #if HAVE_NEON
+ #define RPI_HEVC_SAND 1
+ #else
+ // Sand bust on Pi1 currently - reasons unknown
+ #define RPI_HEVC_SAND 0
+ #endif
+
+
+ #define RPI_QPU_EMU_Y 0
+ #define RPI_QPU_EMU_C 0
+
+ #define RPI_TSTATS 0
+#endif
+
+#endif
+
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000000..e872b855b7
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,935 @@
+#ifdef RPI
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/avassert.h"
+
+#include "config.h"
+
+#include <pthread.h>
+#include <time.h>
+
+#include <interface/vcsm/user-vcsm.h>
+
+#include "rpi_mailbox.h"
+#include "rpi_qpu.h"
+#include "rpi_shader.h"
+#include "rpi_hevc_transform8.h"
+#include "rpi_hevc_transform10.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+#pragma GCC diagnostic pop
+
+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
+
+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
+// Beware this is expensive and will probably throw off all other timing by >10%
+#define RPI_TRACE_QPU_PROFILE_ALL 0
+
+// QPU "noflush" flags
+// a mixture of flushing & profiling
+
+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
+
+#define vcos_verify_ge0(x) ((x)>=0)
+
+// Size in 32bit words
+#define QPU_CODE_SIZE 4098
+#define VPU_CODE_SIZE 2048
+
+static const short rpi_transMatrix2even[32][16] = { // Even rows first
+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
+// Odd rows
+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
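+// (The table above holds the HEVC transform coefficients, even rows first as
+// noted; gpu_init() copies it into GPU memory and vpu_get_constants() returns
+// its VideoCore address for the VPU transform code.)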
+
+// Code/constants on GPU
+struct GPU
+{
+ unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code8[VPU_CODE_SIZE];
+ unsigned int vpu_code10[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
+};
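+// A single instance of this struct is allocated in uncached GPU memory by
+// gpu_init(); members are then located from the VideoCore side as
+// code_gm_ptr.vc + offsetof(struct GPU, member) - see vpu_get_fn() and
+// vpu_get_constants() below.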
+
+#define CFE_ENTS_PER_A 8
+// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
+// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
+// allow 128
+#define CFE_ENT_COUNT 128
+#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A)
+
+struct rpi_cache_flush_env_s {
+// unsigned int n;
+// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
+ struct vcsm_user_clean_invalid2_s v;
+};
+
+#define WAIT_COUNT_MAX 16
+
+typedef struct trace_time_one_s
+{
+ int count;
+ int64_t start[WAIT_COUNT_MAX];
+ int64_t total[WAIT_COUNT_MAX];
+} trace_time_one_t;
+
+typedef struct trace_time_wait_s
+{
+ unsigned int jcount;
+ int64_t start0;
+ int64_t last_update;
+ trace_time_one_t active;
+ trace_time_one_t wait;
+} trace_time_wait_t;
+
+typedef struct vq_wait_s
+{
+ sem_t sem;
+ struct vq_wait_s * next;
+} vq_wait_t;
+
+#define VQ_WAIT_POOL_SIZE 16
+typedef struct vq_wait_pool_s
+{
+ vq_wait_t * head;
+ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
+} vq_wait_pool_t;
+
+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+
+typedef struct gpu_env_s
+{
+ int open_count;
+ int init_count;
+ int mb;
+ int vpu_i_cache_flushed;
+ GPU_MEM_PTR_T code_gm_ptr;
+ vq_wait_pool_t wait_pool;
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ trace_time_wait_t ttw;
+#endif
+} gpu_env_t;
+
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+static gpu_env_t * gpu = NULL;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+
+static int64_t ns_time(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
+}
+
+
+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+
+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
+#define T_ARG(t) T_SEC(t), T_MS(t)
+#define T_FMT "%u.%03u"
+
+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
+{
+ // Update totals for levels that are still pending
+ for (int i = 0; i < tto->count; ++i) {
+ tto->total[i] += now - tto->start[i];
+ tto->start[i] = now;
+ }
+
+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
+ prefix,
+ T_ARG(now - start0 - tto->total[0]),
+ T_ARG(tto->total[0]),
+ T_ARG(tto->total[1]),
+ T_ARG(tto->total[2]),
+ T_ARG(tto->total[3]));
+}
+
+
+static void tto_start(trace_time_one_t * const tto, const int64_t now)
+{
+ av_assert0(tto->count < WAIT_COUNT_MAX);
+ tto->start[tto->count++] = now;
+}
+
+static void tto_end(trace_time_one_t * const tto, const int64_t now)
+{
+ const int n = --tto->count;
+ av_assert0(n >= 0);
+ tto->total[n] += now - tto->start[n];
+}
+
+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
+{
+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
+ tto_print(&ttw->active, now, ttw->start0, "Active");
+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
+}
+
+#endif
+
+// GPU memory alloc fns (internal)
+
+// GPU_MEM_PTR_T alloc fns
+static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+ p->numbytes = (numbytes + 255) & ~255; // Round up
+ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mbox_mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+// printf("***** %s, %d\n", __func__, numbytes);
+
+ return 0;
+}
+
+static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mbox_mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+// printf("***** %s, %d\n", __func__, numbytes);
+ return 0;
+}
+
+static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
+ mbox_mem_unlock(mb, p->vc_handle);
+ vcsm_unlock_ptr(p->arm);
+ vcsm_free(p->vcsm_handle);
+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
+// printf("***** %s\n", __func__);
+}
+
+
+// GPU init, free, lock, unlock
+
+static void gpu_term(void)
+{
+ gpu_env_t * const ge = gpu;
+
+ // We have to hope that everything has terminated...
+ gpu = NULL;
+
+ vc_gpuserv_deinit();
+
+ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+
+ vcsm_exit();
+
+ mbox_close(ge->mb);
+
+ vq_wait_pool_deinit(&ge->wait_pool);
+
+ free(ge);
+}
+
+
+// Connect to QPU, returns 0 on success.
+static int gpu_init(gpu_env_t ** const gpu) {
+ volatile struct GPU* ptr;
+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
+ *gpu = NULL;
+
+ if (ge == NULL)
+ return -1;
+
+ if ((ge->mb = mbox_open()) < 0)
+ return -1;
+
+ vq_wait_pool_init(&ge->wait_pool);
+
+ vcsm_init();
+
+ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+
+ // Zero everything so we have zeros between the code bits
+ memset((void *)ptr, 0, sizeof(*ptr));
+
+ // Now copy over the QPU code into GPU memory
+ {
+ int num_bytes = (char *)mc_end - (char *)rpi_shader;
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+ }
+ // And the VPU code
+ {
+ int num_bytes = sizeof(rpi_hevc_transform8);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
+ }
+ {
+ int num_bytes = sizeof(rpi_hevc_transform10);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
+ }
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
+ *gpu = ge;
+ return 0;
+}
+
+
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+static gpu_env_t * gpu_lock(void) {
+ pthread_mutex_lock(&gpu_mutex);
+
+ av_assert0(gpu != NULL);
+ return gpu;
+}
+
+static gpu_env_t * gpu_lock_ref(void)
+{
+ pthread_mutex_lock(&gpu_mutex);
+
+ if (gpu == NULL) {
+ int rv = gpu_init(&gpu);
+ if (rv != 0) {
+ gpu_unlock();
+ return NULL;
+ }
+ }
+
+ ++gpu->open_count;
+ return gpu;
+}
+
+static void gpu_unlock_unref(gpu_env_t * const ge)
+{
+ if (--ge->open_count == 0)
+ gpu_term();
+
+ gpu_unlock();
+}
+
+static inline gpu_env_t * gpu_ptr(void)
+{
+ av_assert0(gpu != NULL);
+ return gpu;
+}
+
+// Public gpu fns
+
+// Allocate memory on GPU
+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+// Returns 0 on success.
+// This allocates memory that will not be cached in ARM's data cache.
+// Therefore safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
+ gpu_unlock();
+ return r;
+}
+
+// This allocates data that will be
+// Cached in ARM L2
+// Uncached in VPU L2
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
+ gpu_unlock();
+ return r;
+}
+
+void gpu_free(GPU_MEM_PTR_T * const p) {
+ gpu_env_t * const ge = gpu_lock();
+ gpu_free_internal(ge->mb, p);
+ gpu_unlock_unref(ge);
+}
+
+unsigned int vpu_get_fn(const unsigned int bit_depth) {
+ // Make sure that the gpu is initialized
+ av_assert0(gpu != NULL);
+ switch (bit_depth){
+ case 8:
+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
+ case 10:
+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
+ default:
+ av_assert0(0);
+ }
+ return 0;
+}
+
+unsigned int vpu_get_constants(void) {
+ av_assert0(gpu != NULL);
+ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
+}
+
+int gpu_get_mailbox(void)
+{
+ av_assert0(gpu);
+ return gpu->mb;
+}
+
+void gpu_ref(void)
+{
+ gpu_lock_ref();
+ gpu_unlock();
+}
+
+void gpu_unref(void)
+{
+ gpu_env_t * const ge = gpu_lock();
+ gpu_unlock_unref(ge);
+}
+
+// ----------------------------------------------------------------------------
+//
+// Cache flush functions
+
+#define CACHE_EL_MAX 16
+
+rpi_cache_flush_env_t * rpi_cache_flush_init(void)
+{
+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) +
+ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX);
+ if (rfe == NULL)
+ return NULL;
+
+ rfe->v.op_count = 0;
+ return rfe;
+}
+
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+{
+ if (rfe != NULL)
+ free(rfe);
+}
+
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
+{
+ int rc = 0;
+
+ if (vcsm_clean_invalid2(&rfe->v) != 0)
+ rc = -1;
+
+ free(rfe);
+
+ if (rc == 0)
+ return 0;
+
+  av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
+ return rc;
+}
+
+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
+{
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+
+ av_assert0(rfe->v.op_count <= CACHE_EL_MAX);
+
+ b->invalidate_mode = mode;
+ b->block_count = blocks;
+ b->start_address = gm->arm + offset0;
+ b->block_size = block_size;
+ b->inter_block_stride = block_stride;
+}
+
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset, const unsigned int size)
+{
+ // Deal with empty pointer trivially
+ if (gm == NULL || size == 0)
+ return;
+
+ av_assert0(offset <= gm->numbytes);
+ av_assert0(size <= gm->numbytes);
+ av_assert0(offset + size <= gm->numbytes);
+
+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
+}
+
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
+{
+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
+}
+
+
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
+{
+#if !RPI_ONE_BUF
+#error Fixme! (NIF)
+#endif
+ if (gpu_is_buf1(frame)) {
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
+ }
+ else
+ {
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
+ }
+}
+
+// Flush an area of a frame
+// Width, height, x0, y0 in luma pels
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
+{
+ const unsigned int y_offset = frame->linesize[0] * y0;
+ const unsigned int y_size = frame->linesize[0] * height;
+ // Round UV up/down to get everything
+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
+
+#if 0
+ // *** frame->height is cropped height so not good
+ // As all unsigned they will also reject -ve
+ // Test individually as well as added to reject overflow
+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
+ av_assert0(n <= (unsigned int)frame->height);
+ av_assert0(start_line + n <= (unsigned int)frame->height);
+#endif
+
+ if (!gpu_is_buf1(frame))
+ {
+ if (do_luma) {
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
+ }
+ if (do_chroma) {
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
+ }
+ }
+ else if (!av_rpi_is_sand_frame(frame))
+ {
+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+ if (do_luma) {
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
+ }
+ if (do_chroma) {
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
+ }
+ }
+ else
+ {
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
+ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
+
+ if (do_chroma)
+ {
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+ b->invalidate_mode = mode;
+ b->block_count = block_count;
+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
+ b->block_size = uv_size;
+ b->inter_block_stride = stride1 * stride2;
+ }
+ if (do_luma)
+ {
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+ b->invalidate_mode = mode;
+ b->block_count = block_count;
+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
+ b->block_size = y_size;
+ b->inter_block_stride = stride1 * stride2;
+ }
+ }
+}
+
+// Call this to clean and invalidate a region of memory
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+{
+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
+ rpi_cache_flush_finish(rfe);
+}
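+
+// Illustrative only: the intended pattern is to batch several ranges into one
+// env and flush them with a single finish. "gm", "len" and "frame" below are
+// placeholders for a real buffer and frame.
+//
+//   rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+//   if (rfe != NULL) {
+//       rpi_cache_flush_add_gm_range(rfe, gm, RPI_CACHE_FLUSH_MODE_WRITEBACK, 0, len);
+//       rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+//       rpi_cache_flush_finish(rfe);  // one vcsm_clean_invalid2 call for all added blocks
+//   }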
+
+
+// ----------------------------------------------------------------------------
+
+
+// Wait abstractions - mostly so we can easily add profile code
+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+{
+ unsigned int i;
+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+ sem_init(&wp->pool[i].sem, 0, 0);
+ wp->pool[i].next = wp->pool + i + 1;
+ }
+ wp->head = wp->pool + 0;
+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+}
+
+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+{
+ unsigned int i;
+ wp->head = NULL;
+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+ sem_destroy(&wp->pool[i].sem);
+ wp->pool[i].next = NULL;
+ }
+}
+
+
+// If sem_init actually takes time then maybe we want a pool...
+static vq_wait_t * vq_wait_new(void)
+{
+ gpu_env_t * const ge = gpu_lock_ref();
+ vq_wait_t * const wait = ge->wait_pool.head;
+ ge->wait_pool.head = wait->next;
+ wait->next = NULL;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ tto_start(&ge->ttw.active, ns_time());
+#endif
+
+ gpu_unlock();
+ return wait;
+}
+
+static void vq_wait_delete(vq_wait_t * const wait)
+{
+ gpu_env_t * const ge = gpu_lock();
+ wait->next = ge->wait_pool.head;
+ ge->wait_pool.head = wait;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ trace_time_wait_t * const ttw = &ge->ttw;
+ const int64_t now = ns_time();
+ ++ttw->jcount;
+ tto_end(&ttw->wait, now);
+
+ if (ttw->start0 == 0)
+ {
+ ttw->start0 = ttw->active.start[0];
+ ttw->last_update = ttw->start0;
+ }
+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
+ {
+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
+ ttw_print(ttw, now);
+ }
+ }
+#endif
+ gpu_unlock_unref(ge);
+}
+
+static void vq_wait_wait(vq_wait_t * const wait)
+{
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ const int64_t now = ns_time();
+ gpu_env_t * const ge = gpu_lock();
+ tto_start(&ge->ttw.wait, now);
+ gpu_unlock();
+ }
+#endif
+
+ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
+ /* loop */;
+}
+
+static void vq_wait_post(vq_wait_t * const wait)
+{
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ gpu_env_t *const ge = gpu_lock();
+ tto_end(&ge->ttw.active, ns_time());
+ gpu_unlock();
+ }
+#endif
+
+ sem_post(&wait->sem);
+}
+
+
+
+// Header comments were wrong for these two
+#define VPU_QPU_MASK_QPU 1
+#define VPU_QPU_MASK_VPU 2
+
+#define VPU_QPU_JOB_MAX 4
+struct vpu_qpu_job_env_s
+{
+ unsigned int n;
+ unsigned int mask;
+ struct gpu_job_s j[VPU_QPU_JOB_MAX];
+};
+
+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+
+vpu_qpu_job_env_t * vpu_qpu_job_new(void)
+{
+ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
+ return vqj;
+}
+
+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
+{
+ memset(vqj, 0, sizeof(*vqj));
+ free(vqj);
+}
+
+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
+{
+ struct gpu_job_s * const j = vqj->j + vqj->n++;
+ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
+ return j;
+}
+
+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
+{
+ if (vpu_code != 0) {
+ struct gpu_job_s *const j = new_job(vqj);
+ vqj->mask |= VPU_QPU_MASK_VPU;
+
+ j->command = EXECUTE_VPU;
+ // The bottom two bits of the execute address contain no-flush flags
+    // If b0 is unset the VPU I-cache is flushed, so we nearly always want it set
+ // as we never reload code
+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
+ j->u.v.q[1] = r0;
+ j->u.v.q[2] = r1;
+ j->u.v.q[3] = r2;
+ j->u.v.q[4] = r3;
+ j->u.v.q[5] = r4;
+ j->u.v.q[6] = r5;
+ gpu->vpu_i_cache_flushed = 1;
+ }
+}
+
+// flags are QPU_FLAGS_xxx
+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
+{
+ if (n != 0) {
+ struct gpu_job_s *const j = new_job(vqj);
+ vqj->mask |= VPU_QPU_MASK_QPU;
+
+ j->command = EXECUTE_QPU;
+ j->u.q.jobs = n;
+#if RPI_TRACE_QPU_PROFILE_ALL
+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+#else
+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+#endif
+ j->u.q.timeout = 5000;
+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ }
+}
+
+// Convert callback to sem post
+static void vpu_qpu_job_callback_wait(void * v)
+{
+ vq_wait_post(v);
+}
+
+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+{
+ vq_wait_t * wait;
+
+ if (vqj->mask == 0) {
+ *wait_h = NULL;
+ return;
+ }
+
+ // We are going to want a sync object
+ wait = vq_wait_new();
+
+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
+ // If we only posted one thing or only QPU jobs
+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
+ {
+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
+ av_assert0(j->callback.func == 0);
+
+ j->callback.func = vpu_qpu_job_callback_wait;
+ j->callback.cookie = wait;
+ }
+ else
+ {
+ struct gpu_job_s *const j = new_job(vqj);
+
+ j->command = EXECUTE_SYNC;
+ j->u.s.mask = vqj->mask;
+ j->callback.func = vpu_qpu_job_callback_wait;
+ j->callback.cookie = wait;
+ }
+
+ vqj->mask = 0;
+ *wait_h = wait;
+}
+
+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+{
+ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
+}
+
+// Simple wrapper of start + delete
+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
+{
+ int rv;
+ rv = vpu_qpu_job_start(vqj);
+ vpu_qpu_job_delete(vqj);
+ return rv;
+}
+
+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
+{
+ if (wait_h != NULL)
+ {
+ vq_wait_t * const wait = *wait_h;
+ if (wait != NULL) {
+ *wait_h = NULL;
+ vq_wait_wait(wait);
+ vq_wait_delete(wait);
+ }
+ }
+}
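+
+// Illustrative only: the intended lifecycle is new -> add_vpu/add_qpu ->
+// add_sync_this -> finish -> wait. "vpu_code", "r0..r5", "n_jobs" and "mail"
+// below are placeholders for real job parameters.
+//
+//   vpu_qpu_wait_h sync;
+//   vpu_qpu_job_env_t * const vqj = vpu_qpu_job_new();
+//   vpu_qpu_job_add_vpu(vqj, vpu_code, r0, r1, r2, r3, r4, r5);
+//   vpu_qpu_job_add_qpu(vqj, n_jobs, mail);
+//   vpu_qpu_job_add_sync_this(vqj, &sync);
+//   vpu_qpu_job_finish(vqj);  // start the jobs then free the env
+//   vpu_qpu_wait(&sync);      // block until the GPU signals completion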
+
+int vpu_qpu_init(void)
+{
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+
+ if (ge->init_count++ == 0)
+ {
+ vc_gpuserv_init();
+ }
+
+ gpu_unlock();
+ return 0;
+}
+
+void vpu_qpu_term(void)
+{
+ gpu_env_t * const ge = gpu_lock();
+
+ if (--ge->init_count == 0) {
+ vc_gpuserv_deinit();
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ ttw_print(&ge->ttw, ns_time());
+#endif
+ }
+
+ gpu_unlock_unref(ge);
+}
+
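+// Translate a pointer into the rpi_shader[] code array into the VideoCore
+// address of the same code inside the GPU memory block it was copied to
+// during init.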
+uint32_t qpu_fn(const int * const mc_fn)
+{
+ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
+}
+
+
+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+{
+ // Dummy values we can catch with emulation
+ qf->y_pxx = ~1U;
+ qf->y_bxx = ~2U;
+ qf->y_p00 = ~3U;
+ qf->y_b00 = ~4U;
+ qf->c_pxx = ~5U;
+ qf->c_bxx = ~6U;
+
+ switch (bit_depth) {
+ case 8:
+    qf->y_pxx = qpu_fn(mc_filter_y_pxx);
+ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
+ qf->y_p00 = qpu_fn(mc_filter_y_p00);
+ qf->y_b00 = qpu_fn(mc_filter_y_b00);
+ qf->c_pxx = qpu_fn(mc_filter_c_p);
+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
+ qf->c_bxx = qpu_fn(mc_filter_c_b);
+ break;
+ case 10:
+ qf->c_pxx = qpu_fn(mc_filter_c10_p);
+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
+ qf->c_bxx = qpu_fn(mc_filter_c10_b);
+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
+ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
+ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+#endif // RPI
diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
new file mode 100644
index 0000000000..485a08f8ba
--- /dev/null
+++ b/libavcodec/rpi_qpu.h
@@ -0,0 +1,206 @@
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
+#define RPI_ONE_BUF 1
+
+typedef struct gpu_mem_ptr_s {
+ unsigned char *arm; // Pointer to memory mapped on ARM side
+ int vc_handle; // Videocore handle of relocatable memory
+ int vcsm_handle; // Handle for use by VCSM
+ int vc; // Address for use in GPU code
+ int numbytes; // Size of memory block
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T * const p);
+
+#include "libavutil/frame.h"
+#if !RPI_ONE_BUF
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+ return p->vc;
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+}
+
+#else
+
+static inline int gpu_is_buf1(const AVFrame * const frame)
+{
+ return frame->buf[1] == NULL;
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+{
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
+{
+ return av_buffer_pool_opaque(frame->buf[n]);
+}
+
+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
+{
+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
+ return gm->vc + (frame->data[n] - gm->arm);
+}
+
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ return get_vc_address3(frame, 0);
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ return get_vc_address3(frame, 1);
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ return get_vc_address3(frame, 2);
+}
+
+#if 0
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.numbytes = frame->data[1] - frame->data[0];
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 0);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[1] - frame->data[0];
+ g.vc += frame->data[1] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 1);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[2] - frame->data[0];
+ g.vc += frame->data[2] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
+#endif
+#endif
+
+// Cache flush stuff
+
+struct rpi_cache_flush_env_s;
+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+
+rpi_cache_flush_env_t * rpi_cache_flush_init(void);
+// Free env without flushing
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
+// Do the accumulated flush & free the env
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+
+typedef enum
+{
+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
+} rpi_cache_flush_mode_t;
+
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
+ const unsigned int offset, const unsigned int size);
+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+ const unsigned int uv_shift, const int do_luma, const int do_chroma);
+
+// init, add, finish for one gm ptr
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
+
+// QPU specific functions
+
+typedef struct HEVCRpiQpu {
+ uint32_t c_pxx;
+ uint32_t c_pxx_l1;
+ uint32_t c_bxx;
+ uint32_t y_pxx;
+ uint32_t y_bxx;
+ uint32_t y_p00;
+ uint32_t y_b00;
+} HEVCRpiQpu;
+
+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
+
+uint32_t qpu_fn(const int * const mc_fn);
+
+#define QPU_N_GRP 4
+#define QPU_N_MAX 12
+
+#define QPU_MAIL_EL_VALS 2
+
+struct vpu_qpu_wait_s;
+typedef struct vq_wait_s * vpu_qpu_wait_h;
+
+// VPU specific functions
+
+struct vpu_qpu_job_env_s;
+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+
+vpu_qpu_job_h vpu_qpu_job_new(void);
+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
+
+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
+extern unsigned int vpu_get_constants(void);
+
+// Waits for the previously posted code to complete and will null out *wait_h after use
+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
+int vpu_qpu_init(void);
+void vpu_qpu_term(void);
+
+extern int gpu_get_mailbox(void);
+void gpu_ref(void);
+void gpu_unref(void);
+
+#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
index 0000000000..2c6541a8fb
--- /dev/null
+++ b/libavcodec/rpi_shader.c
@@ -0,0 +1,1570 @@
+#include "rpi_shader.h"
+
+#ifdef _MSC_VER
+ #include <stdint.h>
+ /* cast through uintptr_t to avoid warnings */
+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+#else
+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
+#endif
+
+#ifdef __cplusplus
+extern "C" { /* the types are probably wrong... */
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+__declspec(align(8))
+#elif defined(__GNUC__)
+__attribute__((aligned(8)))
+#endif
+unsigned int rpi_shader[] = {
+// ::mc_setup_c_q0
+// ::mc_start
+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_c_qn
+/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1
+/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
+/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
+// :1
+/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+// ::mc_filter_c_p
+/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
+/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
+/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c_p_l1
+/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
+/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
+/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c_b
+/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
+/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
+/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
+/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
+/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
+/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
+/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
+/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
+/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
+/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
+/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
+// :1
+/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
+/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
+/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
+/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
+/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
+/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
+/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
+/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
+/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
+/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_sync_q0
+/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q1
+/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q2
+/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q3
+/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync_q4
+/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q5
+/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q6
+/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q7
+/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync_q8
+/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q9
+/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q10
+/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q11
+/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c_qn
+// ::mc_exit_y_qn
+/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c_q0
+// ::mc_exit_y_q0
+/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_setup_y_q0
+/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_y_qn
+/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif
+/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
+/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
+/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+// :1
+/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
+/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
+// :per_block_setup_8
+/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
+/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
+/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
+/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
+/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
+/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
+/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
+/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
+// ::mc_filter_y_pxx
+/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+// :1
+/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height
+/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_bxx
+/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+// :1
+/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_p00
+/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
+/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
+/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
+/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
+/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
+/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
+/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
+/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
+// :1
+/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
+/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_b00
+/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7
+/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
+/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
+/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
+/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+// :1
+/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
+/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
+/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_setup_c10_q0
+/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_c10_qn
+/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1
+/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
+/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
+/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
+/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif
+/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
+// :1
+/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+// ::mc_filter_c10_p
+/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
+/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
+/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c10_p_l1
+/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+// :1
+/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
+/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
+/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
+/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_c10_b
+/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
+/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
+/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
+/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
+/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
+/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
+/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
+/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
+/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
+/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
+/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
+// :1
+/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
+/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
+/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8)
+/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
+/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
+/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
+/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
+/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
+/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
+/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
+/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_sync10_q0
+/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q1
+/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q2
+/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q3
+/* [0x00002090] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync10_q4
+/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q5
+/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q6
+/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q7
+/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync10_q8
+/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q9
+/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q10
+/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q11
+/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c10_q0
+// ::mc_exit_y10_q0
+/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c10_qn
+// ::mc_exit_y10_qn
+/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_setup_y10_q0
+/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_y10_qn
+/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif
+/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
+/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
+/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
+/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+// :1
+/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
+/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
+// :per_block_setup_10
+/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
+/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
+/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
+/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
+/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
+/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
+/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
+/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
+// ::mc_filter_y10_pxx
+/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+// :1
+/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height
+/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_p00
+/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
+/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
+/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
+/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
+/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
+/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
+/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
+/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
+// :1
+/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
+/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_bxx
+/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+// :1
+/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_b00
+/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7
+/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
+/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
+/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
+/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+// :1
+/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
+/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
+/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_end
+};
+#ifdef __HIGHC__
+#pragma Align_to(8, rpi_shader)
+#endif
diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
new file mode 100644
index 0000000000..82bf380eb4
--- /dev/null
+++ b/libavcodec/rpi_shader.h
@@ -0,0 +1,63 @@
+#ifndef rpi_shader_H
+#define rpi_shader_H
+
+extern unsigned int rpi_shader[];
+
+#define mc_setup_c_q0 (rpi_shader + 0)
+#define mc_start (rpi_shader + 0)
+#define mc_setup_c_qn (rpi_shader + 2)
+#define mc_filter_c_p (rpi_shader + 142)
+#define mc_filter_c_p_l1 (rpi_shader + 272)
+#define mc_filter_c_b (rpi_shader + 402)
+#define mc_sync_q0 (rpi_shader + 590)
+#define mc_sync_q1 (rpi_shader + 608)
+#define mc_sync_q2 (rpi_shader + 620)
+#define mc_sync_q3 (rpi_shader + 632)
+#define mc_sync_q4 (rpi_shader + 644)
+#define mc_sync_q5 (rpi_shader + 662)
+#define mc_sync_q6 (rpi_shader + 674)
+#define mc_sync_q7 (rpi_shader + 686)
+#define mc_sync_q8 (rpi_shader + 698)
+#define mc_sync_q9 (rpi_shader + 716)
+#define mc_sync_q10 (rpi_shader + 728)
+#define mc_sync_q11 (rpi_shader + 740)
+#define mc_exit_c_qn (rpi_shader + 752)
+#define mc_exit_y_qn (rpi_shader + 752)
+#define mc_exit_c_q0 (rpi_shader + 770)
+#define mc_exit_y_q0 (rpi_shader + 770)
+#define mc_setup_y_q0 (rpi_shader + 790)
+#define mc_setup_y_qn (rpi_shader + 792)
+#define mc_filter_y_pxx (rpi_shader + 1032)
+#define mc_filter_y_bxx (rpi_shader + 1162)
+#define mc_filter_y_p00 (rpi_shader + 1292)
+#define mc_filter_y_b00 (rpi_shader + 1382)
+#define mc_setup_c10_q0 (rpi_shader + 1462)
+#define mc_setup_c10_qn (rpi_shader + 1464)
+#define mc_filter_c10_p (rpi_shader + 1600)
+#define mc_filter_c10_p_l1 (rpi_shader + 1728)
+#define mc_filter_c10_b (rpi_shader + 1856)
+#define mc_sync10_q0 (rpi_shader + 2042)
+#define mc_sync10_q1 (rpi_shader + 2060)
+#define mc_sync10_q2 (rpi_shader + 2072)
+#define mc_sync10_q3 (rpi_shader + 2084)
+#define mc_sync10_q4 (rpi_shader + 2096)
+#define mc_sync10_q5 (rpi_shader + 2114)
+#define mc_sync10_q6 (rpi_shader + 2126)
+#define mc_sync10_q7 (rpi_shader + 2138)
+#define mc_sync10_q8 (rpi_shader + 2150)
+#define mc_sync10_q9 (rpi_shader + 2168)
+#define mc_sync10_q10 (rpi_shader + 2180)
+#define mc_sync10_q11 (rpi_shader + 2192)
+#define mc_exit_c10_q0 (rpi_shader + 2204)
+#define mc_exit_y10_q0 (rpi_shader + 2204)
+#define mc_exit_c10_qn (rpi_shader + 2224)
+#define mc_exit_y10_qn (rpi_shader + 2224)
+#define mc_setup_y10_q0 (rpi_shader + 2242)
+#define mc_setup_y10_qn (rpi_shader + 2244)
+#define mc_filter_y10_pxx (rpi_shader + 2494)
+#define mc_filter_y10_p00 (rpi_shader + 2624)
+#define mc_filter_y10_bxx (rpi_shader + 2716)
+#define mc_filter_y10_b00 (rpi_shader + 2846)
+#define mc_end (rpi_shader + 2926)
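+/* The entry points above are word offsets into rpi_shader[] (32-bit words);
+ * each QPU instruction is two words, so e.g. mc_setup_y10_q0 at +2242 is at
+ * byte offset 2242*4 = 0x2308, matching the address comments in the generated
+ * shader table. */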
+
+#endif
diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
index 0000000000..ba6cc13a95
--- /dev/null
+++ b/libavcodec/rpi_shader.qasm
@@ -0,0 +1,1741 @@
+
+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+# the warning that we are using rotation & ra/rb registers. r0..3 can be
+# rotated through all 16 elems; ra regs can only be rotated through their
+# local 4. As it happens this is what is wanted here, as we do not want the
+# constants from the other half of the calc.
+
+# PREREAD is the number of requests that we have sitting in the TMU request
+# queue.
+#
+# There are 8 slots available in the TMU request Q for tm0s requests, but
+# only 4 output FIFO entries and overflow is bad (corruption or crash)
+# (If threaded then only 2 out FIFO entries, but we aren't.)
+# In s/w we are effectively limited to the min vertical read which is >= 4
+# so output FIFO is the limit.
+#
+# However in the current world there seems to be no benefit (and a small
+# overhead) in setting this bigger than 2.
+
+.set PREREAD, 4
+
+# Block heights - 8 & 16 are the only numbers we currently support
+
+.set C_BLK_HEIGHT_8, 16
+.set C_BLK_HEIGHT_16, 8
+.set Y_BLK_HEIGHT_8, 16
+.set Y_BLK_HEIGHT_16, 8
+
+# QPU counts - depend on block size
+# If we have a 2-byte format & block_size > 8 then we can only afford
+# 8 QPUs
+# These numbers must match the numbers in rpi_shader_cmd.h
+
+.set N_QPU_8, 12
+.set N_QPU_16, 12
+
+# register allocation
+#
+
+# ra0-3
+# Used as temp and may be loop filter coeffs (split into .8s)
+# or temp in loop. Check usage on an individual basis.
+
+# ra4-7
+# C: L0 H filter out FIFO
+# otherwise -- free --
+
+# ra8-11
+# temp in some places - check usage
+# Y: (with rb8-11) horiz out FIFO
+
+# ra12-15
+# -- free --
+
+# uniform: width:height
+.set ra_width_height, ra16
+.set ra_width, ra16.16b
+.set ra_height, ra16.16a
+
+# y:y2 same layout as y_y2_next so we can update both together
+.set ra_y_y2, ra17
+.set ra_y2, ra17.16a
+.set ra_y, ra17.16b
+
+# uniform: L1 weight (U on left, V on right)
+# Only used in Y B
+.set ra_wt_off_mul_l1, ra18
+.set ra_wt_off_l1, ra18.16b
+.set ra_wt_mul_l1, ra18.16a
+
+# y_next:y2_next same layout as y_y2 so we can update both together
+.set ra_y_y2_next, ra19
+.set ra_y_next, ra19.16b
+.set ra_y2_next, ra19.16a
+
+# Setup: consts - subdivide a single register
+.set ra_kff100100, ra20
+.set ra_k256, ra20.16a
+.set ra_k0, ra20.8a
+.set ra_k1, ra20.8b
+.set ra_k16, ra20.8c
+.set ra_k255, ra20.8d
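+# (0xff100100 unpacks as .16a = 0x0100 = 256, .8a = 0x00, .8b = 0x01,
+#  .8c = 0x10 = 16 and .8d = 0xff = 255 - hence the constant names above)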
+
+# Loop: xshifts
+.set ra_xshift, ra21.16a
+.set ra_xshift_next, ra21.16b
+
+# Loop var: L0 weight (U on left, V on right)
+# _off_ is not used in loop as we want to modify it before use
+.set ra_wt_off_mul_l0, ra22
+.set ra_wt_mul_l0, ra22.16a
+.set ra_wt_off_l0, ra22.16b
+
+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
+# * Could merge with rb_pmask. For 10 bit, logically pmask needs 0xff in the
+#   2nd byte, but as the source should never be > 3 there, 0x3ff should do
+.set ra_blk_height_pmax, ra23
+.set ra_pmax, ra23.16a
+.set ra_blk_height, ra23.8c
+# -- free -- ra23.8d
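+# (e.g. at 10 bit this is loaded as 0x3ff | (8 << 16) = 0x000803ff, giving
+#  ra_pmax = 0x3ff and ra_blk_height = 8; at 8 bit it is 0xff | (16 << 16))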
+
+# Loop: src frame base (L0)
+.set ra_base, ra24
+
+# Loop: src frame base (L1)
+.set ra_base2, ra25
+
+# Loop: next src frame base (L0)
+.set ra_base_next, ra26
+
+# -- free -- ra27
+# -- free -- ra28
+# -- free -- ra29
+
+# Use an even numbered register as a link register to avoid corrupting flags
+.set ra_link, ra30
+
+# -- free -- ra31
+
+.set rb_xshift2, rb0
+.set rb_xshift2_next, rb1
+
+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
+.set rb_elem_x, rb2
+
+# El Flags
+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
+.set rb_ef, rb3
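+# (rb_ef holds [0,2,0,2,...,1,3,1,3,...] << 30, so add.setf -, rb_ef, rb_ef
+#  carries out of bit 31 exactly for the odd elems and leaves bit 31 set for
+#  elems 8..15 - giving the even/odd and lo/hi half flags described above)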
+
+# rb4-7
+# C-B: L1 H filter out FIFO
+# Y: (with ra2.8x) Y vertical filter coeffs
+
+# rb8-11
+# C: Vertical filter coeffs
+# Y: (with ra8-11) horiz out FIFO
+
+# Loop var: offset to add before shift (round + weighting offsets)
+# Exact value varies by loop
+.set rb_wt_off, rb12
+
+# Setup: denom + 6 + 9
+.set rb_wt_den_p15, rb13
+
+# -- free -- rb14
+# -- free -- rb15
+
+# Line pitch (128 for sand128)
+.set rb_pitch, rb16
+
+# Loop count - 2 (set up TMU for next xfer)
+.set rb_i_tmu, rb17
+
+# Loop count for min(height, 16)
+# Y will reset & loop again if height > 16
+.set rb_lcount, rb18
+
+# frame_base2_next
+.set rb_base2_next, rb19
+
+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
+# offset to the slice
+.set rb_xpitch, rb20
+
+# -- free -- rb21
+
+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
+.set rb_pmask, rb22
+
+# Loop: destination address
+.set rb_dest, rb23
+
+# vdw_setup_1(dst_pitch)
+.set rb_dma1_base, rb24
+
+# Setup: pic width - 1
+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
+.set rb_max_x, rb25
+
+# Loop: height<<23 + width<<16 + vdw_setup_0
+.set rb_dma0, rb26
+
+# vdw_setup_0 (depends on QPU number)
+.set rb_dma0_base, rb27
+
+# Setup: vw_setup value to reset VPM write pointer
+.set rb_vpm_init, rb28
+
+# Loop: vdw_setup_1(dst_pitch-width) = stride
+.set rb_dma1, rb29
+
+# Setup: pic_height - 1
+.set rb_max_y, rb30
+
+# -- free -- rb31
+
+
+
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
+.set i_shift23, -9
+.set i_shift30, -2
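+# (e.g. i_shift23 == -9 and (-9) & 31 == 23, so "shl r1, r1, i_shift23"
+#  shifts left by 23 - i.e. into the height field of rb_dma0)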
+
+# Much of the setup code is common between Y & C
+# Macros that express this - obviously these can't be overlapped
+# so are probably unsuitable for loop code
+
+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
+ mov r2, qpu_num
+.if v_bit_depth <= 8
+ # 8 bit version
+ asr r1, r2, 2
+ shl r1, r1, 6
+ and r0, r2, 3
+ or r0, r0, r1
+
+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+ add r_vpm, r0, r1 # VPM 8bit storage
+
+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+ shl r0, r0, 5
+
+.else
+ # 16 bit version
+ # Limited to 8 QPUs if blk height > 8
+ asr r1, r2, 1
+.if v_blk_height <= 8
+ shl r1, r1, 4
+.else
+ shl r1, r1, 5
+.endif
+ and r0, r2, 1
+ or r0, r0, r1
+
+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
+ add r_vpm, r0, r1
+
+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
+ shl r0, r0, 6
+.endif
+ add r_dma, r0, r1 # DMA out
+.endm
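+# (e.g. 8-bit, qpu_num == 5: r0 = (5 & 3) | ((5 >> 2) << 6) = 0x41, so
+#  r_vpm = vpm_setup(0, 4, h8p(0, 0)) + 0x41 and
+#  r_dma = vdw_setup_0(0, 0, dma_h8p(0,0,0)) + (0x41 << 5))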
+
+
+.macro m_setup_q0
+ srel -, 12
+.endm
+
+# Code start label
+::mc_start
+
+################################################################################
+# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+
+.macro m_setup_c, v_bit_depth
+
+# Cannot use mul24 on x as x might be -ve, so must use shift
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_pmask, 0xff
+.set v_blk_height, C_BLK_HEIGHT_8
+.else
+.set v_x_shift, 2
+.set v_pmask, 0xffff
+.set v_blk_height, C_BLK_HEIGHT_16
+.endif
+
+ mov tmurs, 1 # No swap TMUs
+
+# Load first request location
+ mov ra0, unif # next_x_y
+
+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+ shl rb_ef, r0, i_shift30
+
+ mov ra_base, unif # Store frame c base
+
+# Read image dimensions
+ sub r0, unif, 1 # pic c width
+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
+ sub rb_max_y, unif, 1 # pic c height
+
+# load constants
+ mov ra_kff100100, 0xff100100
+ mov rb_pmask, v_pmask
+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+
+# get source pitch
+ mov rb_xpitch, unif # stride2
+ mov rb_pitch, unif # stride1
+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
+
+ and r0, 1, elem_num
+ nop ; mul24 r0, r0, 5
+.if v_bit_depth <= 8
+ add rb_elem_x, r0, elem_num
+.else
+ add r0, r0, elem_num
+ add rb_elem_x, r0, r0
+.endif
+
+# Compute base address for first and second access
+# ra_base ends up with t0s base
+# ra_base2 ends up with t1s base
+
+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
+ min r0, r0, rb_max_x
+
+# Get shift
+# Shift will always calculate as 0 for 9+ bit
+# Ideally we could optimize the shift out of the code in these cases, but for now
+# it is tidier to leave it in
+.if v_bit_depth <= 8
+ shl ra_xshift_next, r0, 3
+.else
+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+.endif
+
+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
+
+.if v_bit_depth <= 8
+ and r0, r0, -4
+.endif
+ sub r1, ra_k0, rb_pitch
+ and r1, r0, r1
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1
+ add ra_base, ra_base, r0
+
+ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
+
+# Compute part of VPM to use for DMA output
+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
+
+# And again for L1, but only worrying about frame2 stuff
+
+# Load first request location
+ mov ra0, unif # next_x_y
+
+ mov ra_base2, unif # [ra0 delay] Store frame c base
+
+# Compute base address for first and second access
+# ra_base ends up with t0s base
+# ra_base2 ends up with t1s base
+
+ shl r0, ra0.16b, v_x_shift
+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+
+# Get shift (already zero if 9+ bit so ignore)
+.if v_bit_depth <= 8
+ shl rb_xshift2_next, r0, 3
+.endif
+
+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+
+.if v_bit_depth <= 8
+ and r0, r0, -4
+.endif
+ sub r1, ra_k0, rb_pitch
+ and r1, r0, r1
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov r2, ra_y2
+ add ra_base2, ra_base2, r0
+
+# Do preloads
+# r0 = ra_y, r2 = ra_y2
+ mov r3, PREREAD ; mov r0, ra_y
+
+:1
+ sub.setf r3, r3, 1
+ max r1, r0, 0
+ min r1, r1, rb_max_y
+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t0s, ra_base, r1 ; mov ra_y, r0
+
+ max r1, r2, 0
+ brr.anynz -, r:1b
+ min r1, r1, rb_max_y
+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t1s, ra_base2, r1 ; mov ra_y2, r2
+# >>> .anynz 1b
+
+ mov ra_link, unif # link
+# touch registers to keep simulator happy
+ # ra/b4..7: B0 -> B stash registers
+ mov ra4, 0 ; mov rb4, 0
+ bra -, ra_link
+ mov ra5, 0 ; mov rb5, 0
+ mov ra6, 0 ; mov rb6, 0
+ mov ra7, 0 ; mov rb7, 0
+# >>> ra_link
+.endm
+
+::mc_setup_c_q0
+ m_setup_q0
+::mc_setup_c_qn
+ m_setup_c 8
+
+################################################################################
+
+# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+
+.macro m_filter_c_p, v_tmu, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_x_mul, 2
+.set v_v_shift, 8
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 2
+.set v_x_mul, 4
+.set v_v_shift, i_shift16
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+.if v_tmu == 0
+.set vrx_xshift, rb_xshift2 # b side more convenient
+.set vrx_xshift_next, ra_xshift_next
+.set vra_y_next, ra_y_next
+.set vrx_base_next, ra_base_next
+.set vra_y, ra_y
+.set vra_base, ra_base
+.set vr_txs, t0s
+.else
+.set vrx_xshift, ra_xshift # a side more convenient
+.set vrx_xshift_next, rb_xshift2_next
+.set vra_y_next, ra_y2_next
+.set vrx_base_next, rb_base2_next
+.set vra_y, ra_y2
+.set vra_base, ra_base2
+.set vr_txs, t1s
+.endif
+
+# per-channel shifts were calculated on the *previous* invocation
+# get base addresses and per-channel shifts for *next* invocation
+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
+
+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+
+.if v_bit_depth <= 8
+ shl vrx_xshift_next, r0, 3
+ and r0, r0, -4
+.endif
+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
+ add vrx_base_next, r3, r0 ; mov r1, ra_height
+
+# set up VPM write
+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
+
+# ; unpack filter coefficients
+
+ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
+
+ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
+
+ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+
+ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
+ sub ra3, rb_wt_den_p15, ra_k1
+
+# r5 = 0 (loop counter)
+# ra9 = alias for rb_max_y
+# ra_wt_mul_l0 = weight L0
+# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
+# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
+
+# We want (r0r1)
+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
+# We fetch (after shift)
+# C0 : C3 : C1 : C4 : C2 : C5 : ...
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+.if v_tmu == 0
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+.else
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+.endif
+
+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+ min r3, r3, ra9 ; mov.ifnc r0, r2
+
+ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
+
+# apply horizontal filter
+# The filter coeffs for the two halves of this are the same (unlike in the
+# Y case) so it doesn't matter which ra0 we get them from
+# Also as the two halves are locked together we don't need to separate the 1st
+# r0 mul or the last r1 mul as they are valid for all QPUs
+
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+
+# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
+# Have to dup block as we need to move the brr - code is more common than it
+# looks at first glance
+.if v_bit_depth <= 8
+ brr.anyn -, r:1b
+ add r2, r2, r3 ; mov ra5, ra6
+ mov ra6, ra7 ; mul24 r1, ra7, rb10
+ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+.else
+ add r2, r2, r3 ; mov ra5, ra6
+ brr.anyn -, r:1b
+ mov ra6, ra7 ; mul24 r1, ra7, rb10
+ sub r2, r2, r0 ; mul24 r0, ra4, rb8
+ asr ra7, r2, v_bit_depth - 8
+.endif
+# >>> .anyn 1b
+
+ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
+ add r1, r1, r0 ; mul24 r0, ra7, rb11
+ sub r1, r1, r0
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, ra_wt_mul_l0
+ shl r1, r1, 8 ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+ brr.anyn -, r:1b
+ asr r1, r1, ra3
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> .anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
+# >>> 1b
+.endm
+
+# At 10 bits
+# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits)
+# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
+# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
+# (P)
+# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
+# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
+# ... should be OK
+#
+# (B)
+# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
+# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
+# So signed overflow if we sign extend here :-(
+#
+# In practice this doesn't happen (we need a maximal offset and a very unlucky
+# filter).
+#
+# This could be fixed by offsetting the filters s.t. they are unsigned until
+# weight mul and then removing the offset with the weighting offset (I think
+# this should work) or splitting the rounding & offsetting
+
+::mc_filter_c_p
+ m_filter_c_p 0, 8
+
+::mc_filter_c_p_l1
+ m_filter_c_p 1, 8
+
+################################################################################
+
+# mc_filter_c_b
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+
+.macro m_filter_c_b, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_v_shift, 8
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 2
+.set v_v_shift, i_shift16
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+.set v_x_mul, (1 << v_x_shift)
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
+
+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
+
+.if v_bit_depth <= 8
+ shl ra_xshift_next, r0, 3
+.endif
+
+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs)
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
+
+# set up VPM write
+
+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
+
+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
+
+# L1 - uniform layout could possibly be optimized
+
+ shl r0, ra3.16b, v_x_shift # r0=x*2
+ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
+ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
+ min r0, r0, rb_max_x ; mov rb9, ra3.8b
+
+.if v_bit_depth <= 8
+ shl rb_xshift2_next, r0, 3
+.endif
+
+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
+ and r1, r0, r1 ; mov rb10, ra3.8c
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
+ add rb_base2_next, r3, r0
+
+ mov ra9, rb_max_y ; mov rb11, ra3.8d
+ shl r1, ra_wt_off_l1, rb_wt_den_p15
+ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
+
+# r5 loop counter
+# ra0 H coeffs L0
+# ra1 H coeffs L1
+# ra2 V coeffs L0
+# ra3 temp
+# ra4-7 L0 H FIFO
+# rb4-7 L1 H FIFO
+# rb8-rb11 V coeffs L1
+# ra9 rb_max_y alias
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
+ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+ add ra_y, 1, ra_y ; mov r3, ra_y
+
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
+
+# L0 H-filter
+# H FIFO scrolls are spread all over this loop
+ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
+
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+.if v_bit_depth <= 8
+ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
+.else
+ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
+ asr ra3, r2, (v_bit_depth - 8)
+.endif
+
+ shr r2, r4, rb_xshift2 ; mov ra5, ra6
+ shr r1, r2, v_v_shift ; mov r3, ra_y2
+ add ra_y2, r3, ra_k1 ; mov rb6, rb7
+
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
+
+# L1 H-filter
+
+ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+# V filters - start in branch delay slots of H
+# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+ brr.anyn -, r:1b
+ mov ra6, ra7 ; mul24 r3, ra7, rb10
+ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+# >>> .anyn 1b
+
+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+ sub r2, r1, r0 ; mul24 r0, ra4, rb8
+ sub r1, r3, r0 ; mul24 r0, ra5, rb9
+ add r1, r1, r0 ; mul24 r0, ra7, rb11
+ sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+
+ asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+
+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
+ add r1, r1, r2 ; mov r3, ra_blk_height
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
+
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> .anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_c_b
+ m_filter_c_b 8
+
+################################################################################
+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
+# conflicts
+
+.macro m_exit_drain
+.if PREREAD == 2
+# Special case 2 as loop is wasteful
+ nop ; nop ; ldtmu0
+ nop ; nop ; ldtmu1
+ nop ; nop ; ldtmu0
+ mov -, vw_wait ; nop ; ldtmu1
+.else
+ mov.setf r3, PREREAD - 1
+:1
+ brr.anynz -, r:1b
+ nop ; nop ; ldtmu0
+ nop ; nop ; ldtmu1
+ sub.setf r3, r3, 1
+ # >>>
+ mov -, vw_wait
+.endif
+.endm
+
+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
+# All qpus start at the beginning and after that (group - 1) must have finished
+# before (group) can start
+#
+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
+# lockup otherwise)
+#
+# There is some, currently ill defined, potential lockup if we have the VDM active
+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
+#
+# The code stalled when I had many waiters on a single sem so we have a
+# "ripple" of srels to restart. Unsure why, may have been a bug, but this works
+# and we currently have both the memory & sems to support it.
+.macro m_sync_q, n_qpu, n_quads
+# Do not generate code for qpu >= quads * 4 - fns should never be called
+.if n_qpu < n_quads * 4
+ mov ra_link, unif # Can only branch to an a reg (not r0)
+ mov -, vw_wait # [ra_link delay]
+
+.set n_sem_sync, n_qpu - (n_qpu % 4)
+.set n_sem_in, n_qpu
+.set n_sem_out, n_qpu + 1
+
+.if n_qpu % 4 == 0
+
+.set n_sem_quad_in, 12 + n_qpu / 4
+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
+
+ sacq -, n_sem_sync
+ sacq -, n_sem_sync
+ sacq -, n_sem_sync
+ bra -, ra_link
+ sacq -, n_sem_quad_in
+ srel -, n_sem_out
+ srel -, n_sem_quad_out
+
+.else
+ bra -, ra_link
+ srel -, n_sem_sync
+ sacq -, n_sem_in
+.if n_sem_out % 4 != 0
+ srel -, n_sem_out
+.else
+ nop
+.endif
+.endif
+.endif
+.endm
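+# (e.g. with n_quads == 3: QPU 1 does srel 0, sacq 1, srel 2, while QPU 0
+#  does sacq 0 three times, sacq 12, then srel 1 and srel 13)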
+
+.set v_quads8, N_QPU_8 / 4
+
+::mc_sync_q0
+ m_sync_q 0, v_quads8
+::mc_sync_q1
+ m_sync_q 1, v_quads8
+::mc_sync_q2
+ m_sync_q 2, v_quads8
+::mc_sync_q3
+ m_sync_q 3, v_quads8
+::mc_sync_q4
+ m_sync_q 4, v_quads8
+::mc_sync_q5
+ m_sync_q 5, v_quads8
+::mc_sync_q6
+ m_sync_q 6, v_quads8
+::mc_sync_q7
+ m_sync_q 7, v_quads8
+::mc_sync_q8
+ m_sync_q 8, v_quads8
+::mc_sync_q9
+ m_sync_q 9, v_quads8
+::mc_sync_q10
+ m_sync_q 10, v_quads8
+::mc_sync_q11
+ m_sync_q 11, v_quads8
+
+# mc_exit()
+# Chroma & Luma the same now
+
+.macro m_exit_qn
+ m_exit_drain
+ nop ; nop ; thrend
+ nop
+ nop
+# >>> thrend <<<
+.endm
+
+::mc_exit_c_qn
+::mc_exit_y_qn
+ m_exit_qn
+
+
+
+# mc_interrupt_exit12()
+
+.macro m_exit_q0
+ m_exit_drain
+ sacq -, 12
+ nop ; nop ; thrend
+ mov interrupt, 1
+ nop
+# >>> thrend <<<
+.endm
+
+::mc_exit_c_q0
+::mc_exit_y_q0
+ m_exit_q0
+
+# LUMA CODE
+
+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8
+
+
+################################################################################
+# mc_setup
+#
+# typedef struct qpu_mc_pred_y_s_s {
+# qpu_mc_src_t next_src1;
+# qpu_mc_src_t next_src2;
+# uint16_t pic_h;
+# uint16_t pic_w;
+# uint32_t stride2;
+# uint32_t stride1;
+# uint32_t wdenom;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_s_t;
+
+.macro m_setup_y, v_bit_depth
+
+# Cannot use mul24 on x as x might be -ve, so must use shift
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_pmask, 0xff
+.set v_blk_height, Y_BLK_HEIGHT_8
+.else
+.set v_x_shift, 1
+.set v_pmask, 0xffff
+.set v_blk_height, Y_BLK_HEIGHT_16
+.endif
+
+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
+ mov ra9, unif # ref_y_base
+ mov ra1, unif # x2_y2
+ mov ra11, unif # ref_y2_base
+
+# load constants
+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+ shl rb_ef, r0, i_shift30
+
+
+ mov ra_kff100100, 0xff100100
+ mov rb_pmask, v_pmask
+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+
+# Compute part of VPM to use
+
+# Read image dimensions
+ mov ra3, unif # width_height
+ mov rb_xpitch, unif # stride2
+.if v_x_shift == 0
+ sub rb_max_x, ra3.16b, 1
+.else
+ sub r0, ra3.16b, 1
+ shl rb_max_x, r0, v_x_shift
+.endif
+ sub rb_max_y, ra3.16a, 1
+ mov rb_pitch, unif # stride1
+
+# get destination pitch
+ mov r1, vdw_setup_1(0)
+ or rb_dma1_base, r1, rb_pitch
+
+# Compute base address for first and second access
+ mov r3, elem_num
+ add r0, ra0.16b, r3 # Load x + elem_num
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+ shl ra_xshift_next, r0, 3 # Compute shifts
+
+# X is byte offset - we can only load words - mask
+
+ and r0, r0, -4 ; v8subs r2, r2, r2
+ sub r2, r2, rb_pitch
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 # Add stripe offsets
+ add ra_base, ra9, r0
+
+ # r3 still contains elem_num
+ add r0, ra1.16b, r3 # Load x
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+ shl rb_xshift2_next, r0, 3 # Compute shifts
+
+ # r2 still contains mask
+ and r0, r0, -4
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 # Add stripe offsets
+ add ra_base2, ra11, r0
+
+# Do preloads
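+# (issue PREREAD rows of TMU requests for each of the two sources so that the
+#  texture FIFOs are already primed before the first filter loop iteration)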
+ nop ; mov r0, ra0.16a # ; r0 = y
+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
+
+:1
+ sub.setf r3, r3, 1
+ max r1, r0, 0
+ min r1, r1, rb_max_y
+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t0s, ra_base, r1 ; mov ra_y, r0
+
+ max r1, r2, 0
+ brr.anynz -, r:1b
+ min r1, r1, rb_max_y
+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t1s, ra_base2, r1 ; mov ra_y2, r2
+# >>> .anynz 1b
+
+ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
+
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
+
+ mov ra_link, unif # Next fn
+
+# touch vertical context to keep simulator happy
+ mov ra8, 0 ; mov rb8, 0
+ bra -, ra_link
+ mov ra9, 0 ; mov rb9, 0
+ mov ra10, 0 ; mov rb10, 0
+ mov ra11, 0 ; mov rb11, 0
+# >>> ra_link
+.endm
+
+::mc_setup_y_q0
+ m_setup_q0
+::mc_setup_y_qn
+ m_setup_y 8
+
+################################################################################
+#
+# Start of per-block setup code
+# P and B blocks share the same setup code to save on Icache space
+
+# luma_setup_delay3 done in delay slots of branch that got us here
+
+# get base addresses and per-channel shifts for *next* invocation
+# per-channel shifts were calculated on the *previous* invocation
+
+# 1st 3 instructions of per_block-setup in branch delay
+#
+# typedef struct qpu_mc_pred_y_p_s {
+# qpu_mc_src_t next_src1;
+# qpu_mc_src_t next_src2;
+# uint16_t h;
+# uint16_t w;
+# uint32_t mymx21;
+# uint32_t wo1;
+# uint32_t wo2;
+# uint32_t dst_addr;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_p_t;
+#
+
+.macro m_luma_setup, v_bit_depth
+# Hack - QASM may well have label pasting but I have no idea how...
+.if v_bit_depth == 8
+ brr ra_link, r:per_block_setup_8
+.elif v_bit_depth == 10
+ brr ra_link, r:per_block_setup_10
+.endif
+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
+ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+.endm
+
+.macro m_per_block_setup, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_x_mul, 1
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 1
+.set v_x_mul, 2
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+ min r0, r0, rb_max_x
+
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ and r0, r0, -4
+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
+ add ra_base_next, ra_base_next, r0 # [ra1 delay]
+
+ add r0, ra1.16b, r3 # Load x2
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
+ shl rb_xshift2_next, r0, 3 # Compute shifts
+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
+ add rb_base2_next, rb_base2_next, r0
+
+# get width,height of block (unif load above), r1 = width * pel_size
+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
+ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+ add rb_lcount, r0, 7
+ shl r0, r0, v_dma_h_shift
+ add r0, r0, r1 # Combine width and height of destination area
+ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
+ shl ra8, r0, 3 ; mov r3, ra_k255
+
+# Pack the 1st 4 filter coefs for H & V tightly
+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
+
+ mov r1,0x00010100 # -ve [ra8 delay]
+ ror ra2.8a, r1, ra8.8d
+ ror ra0.8a, r1, ra8.8c
+
+ mov r1, 0x01040400
+ ror ra2.8b, r1, ra8.8d
+ ror ra0.8b, r1, ra8.8c
+
+ mov r1,0x050b0a00 # -ve
+ ror ra2.8c, r1, ra8.8d
+ ror ra0.8c, r1, ra8.8c
+
+ mov r1,0x11283a40
+ ror ra2.8d, r1, ra8.8d
+ ror ra0.8d, r1, ra8.8c
+
+# In the 2nd vertical half we use b registers due to using a-side fifo regs
+
+ mov r1,0x3a281100
+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+
+ mov r1,0x0a0b0500 # -ve
+ ror r0, r1, ra8.8d
+ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+
+ mov r1,0x04040100
+ ror r0, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+
+ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address
+
+ mov r1,0x01010000 # -ve
+ ror r0, r1, ra8.8d
+
+ bra -, ra_link
+ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+
+ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc
+ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
+ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val
+# >>> branch ra_link
+
+# r5 = 0
+# ra_wt_mul_l1 = weight L1
+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
+# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1)
+# rb_wt_den_p15 = weight denom + 6 + 9
+# rb_wt_mul_l0 = weight L0
+.endm
+
+:per_block_setup_8
+ m_per_block_setup 8
+
+
+
+################################################################################
+# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, y2_x2 should be y_x+8
+# At this point we have already issued two pairs of texture requests for the current block
+
+.macro m_filter_y_pxx, v_bit_depth
+ m_luma_setup v_bit_depth
+
+ shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+
+# r5 = 0 (loop count)
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# N.B. Whilst y == y2 as far as this loop is concerned, we will start
+# the grab for the next block before we finish with this block, and that
+# might be a B block where y != y2, so we must do full processing on both y and y2
+
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y ; mov ra7, ra8
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+
+# apply horizontal filter
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+
+ sub.setf -, r5, 8 ; mov ra9, ra10
+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+ brr.anyn -, r:1b
+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+ mov ra10, ra11 ; mov rb10, rb11
+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+ # >>> .anyn 1b
+
+ # apply vertical filter and write to VPM
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0
+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+# The top 8 bits have rubbish in them as mul24 is unsigned
+# The low 6 bits need discarding before weighting
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, ra_wt_mul_l0
+ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
+
+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+
+# >>> branch.anyn yloop
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_pxx
+ m_filter_y_pxx 8
+
+
+################################################################################
+
+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+# Or possibly by taking advantage of symmetry?
+
+.macro m_filter_y_bxx, v_bit_depth
+ m_luma_setup v_bit_depth
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y ; mov ra7, ra8
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+
+# apply horizontal filter
+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+
+ sub.setf -, r5, 8 ; mov ra9, ra10
+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+ brr.anyn -, r:1b
+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+ mov ra10, ra11 ; mov rb10, rb11
+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+ # >>> .anyn 1b
+
+ # apply vertical filter and write to VPM
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb_wt_off
+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+# Top 8 bits are bad - low 6 bits should be discarded
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+
+ asr r1, r1, 14
+ nop ; mul24 r0, r1, ra_wt_mul_l0
+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+
+ add r1, r1, r0 ; mov r3, ra_blk_height
+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_bxx
+ m_filter_y_bxx 8
+
+################################################################################
+#
+# typedef struct qpu_mc_pred_y_p00_s {
+# qpu_mc_src_t next_src1;
+# uint16_t h;
+# uint16_t w;
+# uint32_t wo1;
+# uint32_t dst_addr;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_p00_t;
+
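+# P prediction with both filter fractions zero: no 8-tap filtering is needed, so each
+# source pel is simply clamped, weighted and offset before being written to the VPM.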
+.macro m_filter_y_p00, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_x_mul, 1
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 1
+.set v_x_mul, 2
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+ mov ra0, unif ; mov r3, elem_num # y_x
+ mov ra_xshift, ra_xshift_next # [ra0 delay]
+ add r0, ra0.16b, r3
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ and r0, r0, -4 ; v8subs r2, r2, r2
+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
+
+# get width,height of block (unif load above)
+# Compute vdw_setup1(dst_pitch-width)
+ shl r1, ra_width, v_x_shift
+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset
+ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr
+ add rb_dma0, r0, rb_dma0_base
+
+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
+ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
+ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_p00
+ m_filter_y_p00 8
+
+################################################################################
+
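+# B prediction with both filter fractions zero on both refs: each output pel is just
+# the weighted sum of the two source pels plus the weight offset, then shifted and clamped.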
+.macro m_filter_y_b00, v_bit_depth
+# luma setup does a fair bit more than we need, calculating filter coeffs
+# that we will never use, but it saves I-cache to reuse it (and it is simple!)
+ m_luma_setup v_bit_depth
+
+# Fix up vals that were expecting a filter (somewhat icky)
+ mov r0, 7
+ sub rb_i_tmu, rb_i_tmu, r0
+ sub rb_lcount, rb_lcount, r0
+ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+ shl rb_wt_off, rb_wt_off, r0
+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+ add r1, r0, r1
+ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+
+ brr.anyn -, r:1b
+ asr r1, r1, rb_wt_den_p15
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height (currently always 16)
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc rb_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ add rb_lcount, rb_lcount, r0
+ brr -, r:1b
+ add rb_dma0, rb_dma0, r1
+ add rb_dest, rb_dest, r2
+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_b00
+ m_filter_y_b00 8
+
+################################################################################
+################################################################################
+# 10 BIT
+
+::mc_setup_c10_q0
+ m_setup_q0
+::mc_setup_c10_qn
+ m_setup_c 10
+
+::mc_filter_c10_p
+ m_filter_c_p 0, 10
+
+::mc_filter_c10_p_l1
+ m_filter_c_p 1, 10
+
+
+::mc_filter_c10_b
+ m_filter_c_b 10
+
+# Even if these fns are the same as for other bit depths we want our own copy
+# to keep the code we are using in a single lump and so avoid (direct-map) cache
+# thrashing
+.set v_quads10, N_QPU_16 / 4
+
+::mc_sync10_q0
+ m_sync_q 0, v_quads10
+::mc_sync10_q1
+ m_sync_q 1, v_quads10
+::mc_sync10_q2
+ m_sync_q 2, v_quads10
+::mc_sync10_q3
+ m_sync_q 3, v_quads10
+::mc_sync10_q4
+ m_sync_q 4, v_quads10
+::mc_sync10_q5
+ m_sync_q 5, v_quads10
+::mc_sync10_q6
+ m_sync_q 6, v_quads10
+::mc_sync10_q7
+ m_sync_q 7, v_quads10
+::mc_sync10_q8
+ m_sync_q 8, v_quads10
+::mc_sync10_q9
+ m_sync_q 9, v_quads10
+::mc_sync10_q10
+ m_sync_q 10, v_quads10
+::mc_sync10_q11
+ m_sync_q 11, v_quads10
+
+::mc_exit_y10_q0
+::mc_exit_c10_q0
+ m_exit_q0
+
+::mc_exit_y10_qn
+::mc_exit_c10_qn
+ m_exit_qn
+
+::mc_setup_y10_q0
+ m_setup_q0
+::mc_setup_y10_qn
+ m_setup_y 10
+
+:per_block_setup_10
+ m_per_block_setup 10
+
+::mc_filter_y10_pxx
+ m_filter_y_pxx 10
+
+::mc_filter_y10_p00
+ m_filter_y_p00 10
+
+::mc_filter_y10_bxx
+ m_filter_y_bxx 10
+
+::mc_filter_y10_b00
+ m_filter_y_b00 10
+
+
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h
new file mode 100644
index 0000000000..9f8983da52
--- /dev/null
+++ b/libavcodec/rpi_shader_cmd.h
@@ -0,0 +1,128 @@
+#ifndef RPI_SHADER_CMD_H
+#define RPI_SHADER_CMD_H
+
+#pragma pack(push, 4)
+
+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
+// If mixed then we are just confused and get a lot of warnings....
+typedef const uint8_t * qpu_mc_src_addr_t;
+typedef uint8_t * qpu_mc_dst_addr_t;
+#else
+typedef uint32_t qpu_mc_src_addr_t;
+typedef uint32_t qpu_mc_dst_addr_t;
+#endif
+
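+// Each struct below mirrors the uniform stream consumed by one of the QPU motion
+// compensation kernels: the next_src* fields describe the source(s) to prefetch for
+// the *next* block and next_fn holds the code address of the next kernel to branch
+// to (the ARM-side emulation in rpi_shader_template_fn.h dispatches on the same value).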
+typedef struct qpu_mc_src_s
+{
+ int16_t y;
+ int16_t x;
+ qpu_mc_src_addr_t base;
+} qpu_mc_src_t;
+
+
+typedef struct qpu_mc_pred_c_p_s {
+ qpu_mc_src_t next_src;
+ uint16_t h;
+ uint16_t w;
+ uint32_t coeffs_x;
+ uint32_t coeffs_y;
+ uint32_t wo_u;
+ uint32_t wo_v;
+ qpu_mc_dst_addr_t dst_addr_c;
+ uint32_t next_fn;
+} qpu_mc_pred_c_p_t;
+
+typedef struct qpu_mc_pred_c_b_s {
+ qpu_mc_src_t next_src1;
+ uint16_t h;
+ uint16_t w;
+ uint32_t coeffs_x1;
+ uint32_t coeffs_y1;
+ uint32_t weight_u1;
+ uint32_t weight_v1;
+ qpu_mc_src_t next_src2;
+ uint32_t coeffs_x2;
+ uint32_t coeffs_y2;
+ uint32_t wo_u2;
+ uint32_t wo_v2;
+ qpu_mc_dst_addr_t dst_addr_c;
+ uint32_t next_fn;
+} qpu_mc_pred_c_b_t;
+
+typedef struct qpu_mc_pred_c_s_s {
+ qpu_mc_src_t next_src1;
+ uint32_t pic_cw; // C Width (== Y width / 2)
+ uint32_t pic_ch; // C Height (== Y Height / 2)
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t wdenom;
+ qpu_mc_src_t next_src2;
+ uint32_t next_fn;
+} qpu_mc_pred_c_s_t;
+
+typedef struct qpu_mc_pred_c_s {
+ union {
+ qpu_mc_pred_c_p_t p;
+ qpu_mc_pred_c_b_t b;
+ qpu_mc_pred_c_s_t s;
+ };
+} qpu_mc_pred_c_t;
+
+
+typedef struct qpu_mc_pred_y_p_s {
+ qpu_mc_src_t next_src1;
+ qpu_mc_src_t next_src2;
+ uint16_t h;
+ uint16_t w;
+ uint32_t mymx21;
+ uint32_t wo1;
+ uint32_t wo2;
+ qpu_mc_dst_addr_t dst_addr;
+ uint32_t next_fn;
+} qpu_mc_pred_y_p_t;
+
+typedef struct qpu_mc_pred_y_p00_s {
+ qpu_mc_src_t next_src1;
+ uint16_t h;
+ uint16_t w;
+ uint32_t wo1;
+ qpu_mc_dst_addr_t dst_addr;
+ uint32_t next_fn;
+} qpu_mc_pred_y_p00_t;
+
+typedef struct qpu_mc_pred_y_s_s {
+ qpu_mc_src_t next_src1;
+ qpu_mc_src_t next_src2;
+ uint16_t pic_h;
+ uint16_t pic_w;
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t wdenom;
+ uint32_t next_fn;
+} qpu_mc_pred_y_s_t;
+
+// Only a useful structure in that it allows us to return something other than a void *
+typedef struct qpu_mc_pred_y_s {
+ union {
+ qpu_mc_pred_y_p_t p;
+ qpu_mc_pred_y_p00_t p00;
+ qpu_mc_pred_y_s_t s;
+ };
+} qpu_mc_pred_y_t;
+
+typedef union qpu_mc_pred_cmd_u {
+ qpu_mc_pred_y_t y;
+ qpu_mc_pred_c_t c;
+ uint32_t data[1];
+} qpu_mc_pred_cmd_t;
+
+#define QPU_MC_PRED_N_Y8 12
+#define QPU_MC_PRED_N_C8 12
+
+#define QPU_MC_PRED_N_Y10 12
+#define QPU_MC_PRED_N_C10 12
+
+#pragma pack(pop)
+
+#endif
+
diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c
new file mode 100644
index 0000000000..1925ab7a79
--- /dev/null
+++ b/libavcodec/rpi_shader_template.c
@@ -0,0 +1,65 @@
+#ifdef RPI
+
+#include "hevc.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "rpi_shader_cmd.h"
+#include "rpi_shader_template.h"
+
+typedef struct shader_track_s
+{
+ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
+ const struct qpu_mc_src_s *last_l0;
+ const struct qpu_mc_src_s *last_l1;
+ uint32_t width; // pic_width * PW
+ uint32_t height;
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t wdenom;
+} shader_track_t;
+
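+// Map a block width in pels onto the index used by the hevcdsp put_hevc_* fn tables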
+static int wtoidx(const unsigned int w)
+{
+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+ return pel_weight[w];
+}
+
+static int fctom(uint32_t x)
+{
+ int rv;
+ // As it happens we can take the 2nd filter term & divide it by 8
+ // (dropping fractions) to get the fractional move
+ rv = 8 - ((x >> 11) & 0xf);
+ av_assert2(rv >= 0 && rv <= 7);
+ return rv;
+}
+
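+// Sign-extending bit-field extract: shift the field up to the top then arithmetic-shift back down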
+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
+{
+ return (x << shl) >> shr;
+}
+
+static inline int woff_p(HEVCContext *const s, int32_t x)
+{
+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
+}
+
+static inline int woff_b(HEVCContext *const s, int32_t x)
+{
+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
+}
+
+static inline int wweight(int32_t x)
+{
+ return ext(x, 16, 16);
+}
+
+
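+// "Template" instantiation by textual inclusion: rpi_shader_template_fn.h is pulled in
+// twice, once with PW (pel width in bytes) = 1 and once with PW = 2, so that, e.g.,
+// FUNC(rpi_shader_c) expands to the rpi_shader_c8 and rpi_shader_c16 variants declared
+// in rpi_shader_template.h.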
+#define PW 1
+#include "rpi_shader_template_fn.h"
+
+#undef PW
+#define PW 2
+#include "rpi_shader_template_fn.h"
+
+#endif
+
diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h
new file mode 100644
index 0000000000..ecf5b8185a
--- /dev/null
+++ b/libavcodec/rpi_shader_template.h
@@ -0,0 +1,24 @@
+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
+
+#ifdef RPI
+struct HEVCContext;
+struct HEVCRpiInterPredEnv;
+
+void rpi_shader_c8(struct HEVCContext *const s,
+ const struct HEVCRpiInterPredEnv *const ipe_y,
+ const struct HEVCRpiInterPredEnv *const ipe_c);
+
+void rpi_shader_c16(struct HEVCContext *const s,
+ const struct HEVCRpiInterPredEnv *const ipe_y,
+ const struct HEVCRpiInterPredEnv *const ipe_c);
+
+void rpi_sand_dump8(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
+
+void rpi_sand_dump16(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
+
+#endif
+#endif
+
diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h
new file mode 100644
index 0000000000..b5ac2ceed6
--- /dev/null
+++ b/libavcodec/rpi_shader_template_fn.h
@@ -0,0 +1,477 @@
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
+
+#define PATCH_STRIDE (16 * PW)
+
+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
+{
+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
+ const pixel s = *(const pixel *)src;
+ pixel * d = (pixel *)dst;
+ for (unsigned int j = 0; j < w; j += PW) {
+ *d++ = s;
+ }
+ }
+}
+
+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
+{
+ for (unsigned int i = 0; i != h; ++i, dst += stride) {
+ memcpy(dst, src, w);
+ }
+}
+
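+// Fetch a w x h patch for one reference block from a sand-format frame into a planar
+// scratch buffer. Out-of-picture coordinates are clamped and the missing columns/rows
+// are then filled by replicating the edge pels, matching the clamped per-sample reads
+// the QPU code does.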
+static void FUNC(get_patch_y)(const shader_track_t * const st,
+ uint8_t * dst, const unsigned int dst_stride,
+ const qpu_mc_src_t *src,
+ unsigned int _w, unsigned int _h)
+{
+ int x = src->x * PW;
+ int y = src->y;
+ int w = _w * PW;
+ int h = _h;
+ int dl = 0;
+ int dr = 0;
+ int dt = 0;
+ int db = 0;
+
+ if (x < 0) {
+ if (-x >= w)
+ x = PW - w;
+ dl = -x;
+ w += x;
+ x = 0;
+ }
+ if (x + w > st->width) {
+ if (x >= st->width)
+ x = st->width - PW;
+ dr = (x + w) - st->width;
+ w = st->width - x;
+ }
+
+ // Y
+ if (y < 0) {
+ if (-y >= h)
+ y = 1 - h;
+ dt = -y;
+ h += y;
+ y = 0;
+ }
+ if (y + h > st->height) {
+ if (y >= st->height)
+ y = st->height - 1;
+ db = (y + h) - st->height;
+ h = st->height - y;
+ }
+
+ dst += dl + dt * dst_stride;
+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
+
+ // Edge dup
+ if (dl != 0)
+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
+ if (dr != 0)
+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
+ w += dl + dr;
+ dst -= dl;
+
+ if (dt != 0)
+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
+ if (db != 0)
+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
+}
+
+
+
+static void FUNC(get_patch_c)(const shader_track_t * const st,
+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
+ const qpu_mc_src_t *src,
+ unsigned int _w, unsigned int _h)
+{
+ int x = src->x * PW;
+ int y = src->y;
+ int w = _w * PW;
+ int h = _h;
+ int dl = 0;
+ int dr = 0;
+ int dt = 0;
+ int db = 0;
+ const int width = st->width;
+ const int height = st->height;
+
+ if (x < 0) {
+ if (-x >= w)
+ x = PW - w;
+ dl = -x;
+ w += x;
+ x = 0;
+ }
+ if (x + w > width) {
+ if (x >= width)
+ x = width - PW;
+ dr = (x + w) - width;
+ w = width - x;
+ }
+
+ // Y
+ if (y < 0) {
+ if (-y >= h)
+ y = 1 - h;
+ dt = -y;
+ h += y;
+ y = 0;
+ }
+ if (y + h > height) {
+ if (y >= height)
+ y = height - 1;
+ db = (y + h) - height;
+ h = height - y;
+ }
+
+ dst_u += dl + dt * dst_stride;
+ dst_v += dl + dt * dst_stride;
+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
+
+ // Edge dup
+ if (dl != 0)
+ {
+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
+ }
+ if (dr != 0)
+ {
+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
+ }
+ w += dl + dr;
+ dst_u -= dl;
+ dst_v -= dl;
+
+ if (dt != 0)
+ {
+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
+ }
+ if (db != 0)
+ {
+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
+ }
+}
+
+// x, y, w, h in pixels
+// stride1, stride2 in bytes
+void FUNC(rpi_sand_dump)(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
+{
+ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
+
+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
+
+ if (is_c) {
+ x *= 2;
+ w *= 2;
+ }
+
+ for (int i = y; i != y + h; ++i) {
+ for (int j = x; j != x + w; ++j) {
+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
+ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
+#if PW == 1
+ if (j < 0 || i < 0)
+ printf("..%c", sep);
+ else
+ printf("%02x%c", *(const pixel*)p, sep);
+#else
+ if (j < 0 || i < 0)
+ printf("...%c", sep);
+ else
+ printf("%03x%c", *(const pixel*)p, sep);
+#endif
+ }
+ printf("\n");
+ }
+}
+
+
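+// ARM-side equivalent of the QPU shaders (presumably only used when QPU emulation is
+// enabled): walk each QPU's command queue, dispatch on the link word - the code address
+// the real shader would branch to next - and perform the matching prediction with the
+// normal hevcdsp functions, stepping to the next QPU at each sync point and finishing
+// once the exit code is reached.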
+void FUNC(rpi_shader_c)(HEVCContext *const s,
+ const HEVCRpiInterPredEnv *const ipe_y,
+ const HEVCRpiInterPredEnv *const ipe_c)
+{
+ for (int c_idx = 0; c_idx < 2; ++c_idx)
+ {
+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
+ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
+ unsigned int exit_n = 0;
+
+ if (ipe == NULL || !ipe->used) {
+ continue;
+ }
+
+ do {
+ for (unsigned int i = 0; i != ipe->n; ++i) {
+ const HEVCRpiInterPredQ * const q = ipe->q + i;
+ shader_track_t * const st = tracka + i;
+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
+
+ for (;;) {
+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
+
+ if (link == q->code_setup) {
+ if (c_idx == 0) {
+ // Luma
+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
+
+ st->height = c->pic_h;
+ st->width = c->pic_w * PW;
+ st->stride1 = c->stride1;
+ st->stride2 = c->stride2;
+ st->wdenom = c->wdenom;
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else {
+ // Chroma
+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
+
+ st->height = c->pic_ch;
+ st->width = c->pic_cw * PW;
+ st->stride1 = c->stride1;
+ st->stride2 = c->stride2;
+ st->wdenom = c->wdenom;
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ }
+ else if (link == s->qpu.y_pxx) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+ const int w1 = FFMIN(c->w, 8);
+ const int w2 = c->w - w1;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+ if (w2 > 0) {
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h + 7);
+ }
+
+ // wo[offset] = offset*2+1
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
+ if (w2 > 0) {
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
+ }
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_bxx) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h + 7);
+
+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
+
+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_p00) {
+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+
+ // wo[offset] = offset*2+1
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
+
+ st->last_l0 = &c->next_src1;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_b00) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ av_assert0(c->w <= 16 && c->h <= 64);
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h);
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h);
+
+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
+ patch_y3, patch_y1, PATCH_STRIDE,
+ c->h, 0, 0, c->w);
+
+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
+ 0, woff_b(s, c->wo2), 0, 0, c->w);
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_pxx) {
+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
+ const int mx = fctom(c->coeffs_x);
+ const int my = fctom(c->coeffs_y);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l0 = &c->next_src;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_pxx_l1) {
+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
+ const int mx = fctom(c->coeffs_x);
+ const int my = fctom(c->coeffs_y);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l1 = &c->next_src;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_bxx) {
+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
+ const int mx1 = fctom(c->coeffs_x1);
+ const int my1 = fctom(c->coeffs_y1);
+ const int mx2 = fctom(c->coeffs_x2);
+ const int my2 = fctom(c->coeffs_y2);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72];
+ uint8_t patch_v1[PATCH_STRIDE * 72];
+ uint8_t patch_u2[PATCH_STRIDE * 72];
+ uint8_t patch_v2[PATCH_STRIDE * 72];
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, mx1, my1, c->w);
+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, mx1, my1, c->w);
+
+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
+ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
+ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == q->code_sync) {
+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
+ break;
+ }
+ else if (link == q->code_exit) {
+ // We expect exit to occur without other sync
+ av_assert0(i == exit_n);
+ ++exit_n;
+ break;
+ }
+ else {
+ av_assert0(0);
+ }
+ }
+
+ st->qpu_mc_curr = cmd;
+ }
+ } while (exit_n == 0);
+ }
+}
+
+#undef FUNC
+#undef pixel
+
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
index 0000000000..b502de0a2c
--- /dev/null
+++ b/libavcodec/rpi_zc.c
@@ -0,0 +1,745 @@
+#include "config.h"
+#ifdef RPI
+#include "libavcodec/avcodec.h"
+#include "rpi_qpu.h"
+#include "rpi_mailbox.h"
+#include "rpi_zc.h"
+#include "libavutil/avassert.h"
+#include "libavutil/rpi_sand_fns.h"
+#include <pthread.h>
+
+#include "libavutil/buffer_internal.h"
+#include <interface/vctypes/vc_image_types.h>
+
+#define TRACE_ALLOC 0
+
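+// Simple recycling pool of GPU buffers of (roughly) a single size: freed entries are
+// kept on a list and reused while requests stay close to the current size; a request
+// for a markedly different size flushes the pool (see zc_pool_alloc).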
+struct ZcPoolEnt;
+
+typedef struct ZcPool
+{
+ int numbytes;
+ unsigned int n;
+ struct ZcPoolEnt * head;
+ pthread_mutex_t lock;
+} ZcPool;
+
+typedef struct ZcPoolEnt
+{
+ // It is important that we start with gmem as other bits of code will expect to see that
+ GPU_MEM_PTR_T gmem;
+ unsigned int n;
+ struct ZcPoolEnt * next;
+ struct ZcPool * pool;
+} ZcPoolEnt;
+
+#define ALLOC_PAD 0
+#define ALLOC_ROUND 0x1000
+#define ALLOC_N_OFFSET 0
+#define STRIDE_ROUND 64
+#define STRIDE_OR 0
+
+#define DEBUG_ZAP0_BUFFERS 0
+
+
+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
+{
+ ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+
+ // Round up to ALLOC_ROUND (4k) and add ALLOC_PAD
+ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ goto fail0;
+ }
+
+ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
+ goto fail1;
+ }
+
+#if TRACE_ALLOC
+ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+#endif
+
+ pool->numbytes = zp->gmem.numbytes;
+ zp->next = NULL;
+ zp->pool = pool;
+ zp->n = pool->n++;
+ return zp;
+
+fail1:
+ av_free(zp);
+fail0:
+ return NULL;
+}
+
+static void zc_pool_ent_free(ZcPoolEnt * const zp)
+{
+#if TRACE_ALLOC
+ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+#endif
+
+ gpu_free(&zp->gmem);
+ av_free(zp);
+}
+
+static void zc_pool_flush(ZcPool * const pool)
+{
+ ZcPoolEnt * p = pool->head;
+ pool->head = NULL;
+ pool->numbytes = -1;
+
+ while (p != NULL)
+ {
+ ZcPoolEnt * const zp = p;
+ p = p->next;
+ zc_pool_ent_free(zp);
+ }
+}
+
+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes)
+{
+ ZcPoolEnt * zp;
+ int numbytes;
+
+ pthread_mutex_lock(&pool->lock);
+
+ numbytes = pool->numbytes;
+
+ // If size isn't close then dump the pool
+ // Close in this context means within 128k
+ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
+ {
+ zc_pool_flush(pool);
+ numbytes = req_bytes;
+ }
+
+ if (pool->head != NULL)
+ {
+ zp = pool->head;
+ pool->head = zp->next;
+ }
+ else
+ {
+ zp = zc_pool_ent_alloc(pool, numbytes);
+ }
+
+ pthread_mutex_unlock(&pool->lock);
+
+ // Start with our buffer empty of preconceptions
+// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
+
+ return zp;
+}
+
+static void zc_pool_free(ZcPoolEnt * const zp)
+{
+ ZcPool * const pool = zp == NULL ? NULL : zp->pool;
+ if (zp != NULL)
+ {
+ pthread_mutex_lock(&pool->lock);
+#if TRACE_ALLOC
+ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes);
+#endif
+
+ if (pool->numbytes == zp->gmem.numbytes)
+ {
+ zp->next = pool->head;
+ pool->head = zp;
+ pthread_mutex_unlock(&pool->lock);
+ }
+ else
+ {
+ pthread_mutex_unlock(&pool->lock);
+ zc_pool_ent_free(zp);
+ }
+ }
+}
+
+static void
+zc_pool_init(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ pool->head = NULL;
+ pthread_mutex_init(&pool->lock, NULL);
+}
+
+static void
+zc_pool_destroy(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ zc_pool_flush(pool);
+ pthread_mutex_destroy(&pool->lock);
+}
+
+typedef struct ZcOldCtxVals
+{
+ int thread_safe_callbacks;
+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+ void * get_buffer_context;
+} ZcOldCtxVals;
+
+typedef struct AVZcEnv
+{
+ unsigned int refcount;
+ ZcPool pool;
+ ZcOldCtxVals old;
+} ZcEnv;
+
+// Callback when buffer unrefed to zero
+static void rpi_free_display_buffer(void *opaque, uint8_t *data)
+{
+ ZcPoolEnt *const zp = opaque;
+// printf("%s: data=%p\n", __func__, data);
+ zc_pool_free(zp);
+}
+
+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
+{
+ // Kludge: check the free fn to verify this is really
+ // one of our buffers - can't think of a better way
+ return buf == NULL || buf->buffer->free != rpi_free_display_buffer ? NULL :
+ av_buffer_get_opaque(buf);
+}
+
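+// Work out the stride/height/plane layout the GPU expects for a frame of the given
+// format and size. For the sand formats the numbers come from the VC via the mailbox
+// and are cached under a lock as the query is relatively expensive.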
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const int format, const unsigned int video_width, const unsigned int video_height)
+{
+ AVRpiZcFrameGeometry geo;
+
+ switch (format)
+ {
+ case AV_PIX_FMT_YUV420P:
+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+ geo.stride_c = geo.stride_y / 2;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ geo.planes_c = 2;
+ geo.stripes = 1;
+ geo.bytes_per_pel = 1;
+ break;
+
+ case AV_PIX_FMT_YUV420P10:
+ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+ geo.stride_c = geo.stride_y / 2;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ geo.planes_c = 2;
+ geo.stripes = 1;
+ geo.bytes_per_pel = 2;
+ break;
+
+ case AV_PIX_FMT_SAND128:
+ {
+ const unsigned int stripe_w = 128;
+
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox, keep a stashed
+ // copy as we will almost certainly just want the same numbers again,
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV_UV,
+ .width = video_width,
+ .height = video_height
+ };
+
+ gpu_ref();
+ mbox_get_image_params(gpu_get_mailbox(), &new_img);
+ gpu_unref();
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.planes_c = 1;
+ geo.stripes = (video_width + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 1;
+
+ pthread_mutex_unlock(&sand_lock);
+
+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
+ break;
+ }
+
+ case AV_PIX_FMT_SAND64_16:
+ case AV_PIX_FMT_SAND64_10:
+ {
+ const unsigned int stripe_w = 128; // bytes
+
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox, keep a stashed
+ // copy as we will almost certainly just want the same numbers again,
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV_UV_16,
+ .width = video_width,
+ .height = video_height
+ };
+
+ gpu_ref();
+ mbox_get_image_params(gpu_get_mailbox(), &new_img);
+ gpu_unref();
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.planes_c = 1;
+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 2;
+
+ pthread_mutex_unlock(&sand_lock);
+ break;
+ }
+
+ default:
+ memset(&geo, 0, sizeof(geo));
+ break;
+ }
+ return geo;
+}
+
+
+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+{
+ ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ AVBufferRef * buf;
+ intptr_t idata;
+#if ALLOC_N_OFFSET != 0
+ intptr_t noff;
+#endif
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ goto fail0;
+ }
+
+ // Don't dereference zp until we know the alloc succeeded
+ idata = (intptr_t)zp->gmem.arm;
+#if ALLOC_N_OFFSET != 0
+ noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
+#endif
+
+#if ALLOC_N_OFFSET != 0
+ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
+#endif
+
+#if DEBUG_ZAP0_BUFFERS
+ memset((void*)idata, 0, size);
+#endif
+
+ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ goto fail2;
+ }
+
+ return buf;
+
+fail2:
+ zc_pool_free(zp);
+fail0:
+ return NULL;
+}
+
+static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame)
+{
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
+ const unsigned int size_y = geo.stride_y * geo.height_y;
+ const unsigned int size_c = geo.stride_c * geo.height_c;
+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
+ AVBufferRef * buf;
+ unsigned int i;
+
+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
+
+ if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ return AVERROR(ENOMEM);
+ }
+
+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+ frame->buf[i] = NULL;
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ }
+
+ frame->buf[0] = buf;
+
+ frame->linesize[0] = geo.stride_y;
+ frame->linesize[1] = geo.stride_c;
+ frame->linesize[2] = geo.stride_c;
+ // abuse: linesize[3] = "stripe stride"
+ // stripe_stride is NOT the byte stride between stripes; it is that divided by geo.stride_y
+ // (i.e. it is measured in rows). In the general case this makes the address calculation
+ // an xor and multiply rather than a divide and multiply
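+ // (So, roughly, a luma byte at (x, y) lives at
+ //  data[0] + (x & (linesize[0]-1)) + y*linesize[0] + (x & ~(linesize[0]-1))*linesize[3]
+ //  - cf. the addressing used in rpi_sand_dump8/16.)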
+ if (geo.stripes > 1)
+ frame->linesize[3] = geo.height_y + geo.height_c;
+
+ frame->data[0] = buf->data;
+ frame->data[1] = frame->data[0] + size_y;
+ if (geo.planes_c > 1)
+ frame->data[2] = frame->data[1] + size_c;
+
+ frame->extended_data = frame->data;
+ // Leave extended buf alone
+
+#if RPI_ZC_SAND_8_IN_10_BUF != 0
+ // *** If we intend to use this for real we will want a 2nd buffer pool
+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge
+#endif
+
+ return 0;
+}
+
+#define RPI_GET_BUFFER2 1
+
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+{
+#if !RPI_GET_BUFFER2
+ return avcodec_default_get_buffer2(s, frame, flags);
+#else
+ int rv;
+
+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
+ {
+// printf("Do default alloc: format=%#x\n", frame->format);
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+ else if (frame->format == AV_PIX_FMT_YUV420P ||
+ av_rpi_is_sand_frame(frame))
+ {
+ rv = rpi_get_display_buffer(s->get_buffer_context, frame);
+ }
+ else
+ {
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+
+#if 0
+ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+ frame->format, frame->width, frame->height,
+ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
+ frame->data[0], frame->data[1], frame->data[2],
+ frame->buf[0], frame->buf[1], frame->buf[2],
+ av_buffer_get_opaque(frame->buf[0]));
+#endif
+ return rv;
+#endif
+}
+
+
+static AVBufferRef * zc_copy(struct AVCodecContext * const s,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * pdest;
+
+ dest->format = src->format;
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ {
+ return NULL;
+ }
+
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ memcpy(pdest, psrc, dest->width);
+ }
+ for (i = 0, psrc = src->data[1], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+ for (i = 0, psrc = src->data[2], pdest = dest->data[2];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+
+ return dest->buf[0];
+}
+
+
+static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * psrc2, * pdest;
+
+ memset(dest, 0, sizeof(*dest));
+ dest->format = AV_PIX_FMT_SAND128;
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ {
+ return NULL;
+ }
+
+ // Y
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ const uint16_t * sp = (uint16_t*)psrc;
+ uint8_t * d = pdest;
+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0])
+ {
+ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k);
+ for (unsigned int j = 0; j != n; ++j)
+ *d++ = (uint8_t)(*sp++ >> 2);
+ d += (dest->linesize[3] - 1) * dest->linesize[0];
+ }
+ }
+
+ // C
+ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1])
+ {
+ const uint16_t * su = (uint16_t*)psrc;
+ const uint16_t * sv = (uint16_t*)psrc2;
+ uint8_t * d = pdest;
+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1])
+ {
+ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2;
+ for (unsigned int j = 0; j != n; ++j)
+ {
+ *d++ = (uint8_t)(*su++ >> 2);
+ *d++ = (uint8_t)(*sv++ >> 2);
+ }
+ d += (dest->linesize[3] - 1) * dest->linesize[1];
+ }
+ }
+
+ return dest->buf[0];
+}
+
+
+static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s,
+ const AVFrame * const src, const unsigned int src_bits)
+{
+ AVFrame dest_frame = {
+ .format = AV_PIX_FMT_SAND128,
+ .width = src->width,
+ .height = src->height
+ };
+ AVFrame * const dest = &dest_frame;
+ const unsigned int shr = src_bits - 8;
+
+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ {
+ return NULL;
+ }
+
+ // Y
+ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], av_rpi_sand_frame_stride2(dest),
+ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest),
+ src->width, src->height, shr);
+ // C
+ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest),
+ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest),
+ src->width, src->height / 2, shr);
+
+ return dest->buf[0];
+}
+
+
+
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
+{
+ assert(s != NULL);
+
+ if (frame->format != AV_PIX_FMT_YUV420P &&
+ frame->format != AV_PIX_FMT_YUV420P10 &&
+ !av_rpi_is_sand_frame(frame))
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
+ return NULL;
+ }
+
+ if (frame->buf[1] != NULL || frame->format != expected_format)
+ {
+#if RPI_ZC_SAND_8_IN_10_BUF
+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
+ {
+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
+ }
+#endif
+
+ if (maycopy)
+ {
+ if (frame->buf[1] != NULL)
+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+ else
+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
+
+ switch (frame->format)
+ {
+ case AV_PIX_FMT_YUV420P10:
+ return zc_420p10_to_sand128(s, frame);
+
+ case AV_PIX_FMT_SAND64_10:
+ return zc_sand64_16_to_sand128(s, frame, 10);
+
+ default:
+ return zc_copy(s, frame);
+ }
+ }
+ else
+ {
+ if (frame->buf[1] != NULL)
+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
+ else
+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
+ return NULL;
+ }
+ }
+
+ if (pic_gm_ptr(frame->buf[0]) == NULL)
+ {
+ if (maycopy)
+ {
+ av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
+ return zc_copy(s, frame);
+ }
+ else
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ return av_buffer_ref(frame->buf[0]);
+}
+
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? -1 : p->vc_handle;
+}
+
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : fr_ref->data - p->arm;
+}
+
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
+{
+ return fr_ref == NULL ? 0 : fr_ref->size;
+}
+
+
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : p->numbytes;
+}
+
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
+{
+ if (fr_ref != NULL)
+ {
+ av_buffer_unref(&fr_ref);
+ }
+}
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void)
+{
+ ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
+ if (zc == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
+ return NULL;
+ }
+
+ zc_pool_init(&zc->pool);
+ return zc;
+}
+
+void av_rpi_zc_env_free(AVZcEnvPtr zc)
+{
+ if (zc != NULL)
+ {
+        zc_pool_destroy(&zc->pool);
+ av_free(zc);
+ }
+}
+
+int av_rpi_zc_in_use(const struct AVCodecContext * const s)
+{
+ return s->get_buffer2 == av_rpi_zc_get_buffer2;
+}
+
+int av_rpi_zc_init(struct AVCodecContext * const s)
+{
+ if (av_rpi_zc_in_use(s))
+ {
+ ZcEnv * const zc = s->get_buffer_context;
+ ++zc->refcount;
+ }
+ else
+ {
+ ZcEnv *const zc = av_rpi_zc_env_alloc();
+ if (zc == NULL)
+ {
+ return AVERROR(ENOMEM);
+ }
+
+ zc->refcount = 1;
+ zc->old.get_buffer_context = s->get_buffer_context;
+ zc->old.get_buffer2 = s->get_buffer2;
+ zc->old.thread_safe_callbacks = s->thread_safe_callbacks;
+
+ s->get_buffer_context = zc;
+ s->get_buffer2 = av_rpi_zc_get_buffer2;
+ s->thread_safe_callbacks = 1;
+ }
+ return 0;
+}
+
+void av_rpi_zc_uninit(struct AVCodecContext * const s)
+{
+ if (av_rpi_zc_in_use(s))
+ {
+ ZcEnv * const zc = s->get_buffer_context;
+ if (--zc->refcount == 0)
+ {
+ s->get_buffer2 = zc->old.get_buffer2;
+ s->get_buffer_context = zc->old.get_buffer_context;
+ s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
+ av_rpi_zc_env_free(zc);
+ }
+ }
+}
+
+#endif // RPI
+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000000..26fb3be999
--- /dev/null
+++ b/libavcodec/rpi_zc.h
@@ -0,0 +1,105 @@
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
+// Zero-Copy frame code for RPi
+// RPi needs Y/U/V planes to be contiguous for display. By default
+// ffmpeg will allocate separate planes, so a memcpy is needed before
+// display. This code provides a method of making ffmpeg allocate a single
+// block of memory for the frame, which can then be reference counted until
+// display has finished with it.
+
+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
+// 0 disables
+// *** This option is still in development
+// Only works if SAO active
+// Allocates buffers that are twice the required size
+#define RPI_ZC_SAND_8_IN_10_BUF 0
+
+struct AVBufferRef;
+struct AVFrame;
+struct AVCodecContext;
+enum AVPixelFormat;
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef struct AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+ unsigned int stride_y; // Luma stride (bytes)
+ unsigned int height_y; // Luma height (lines)
+ unsigned int stride_c; // Chroma stride (bytes)
+    unsigned int height_c;   // Chroma height (lines)
+ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1)
+ unsigned int stripes; // Number of stripes (sand)
+ unsigned int bytes_per_pel;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const int format,
+ const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_open2
+//
+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1 as otherwise the buffer info is killed before being returned
+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
+// returned must be manually derefed with av_frame_unref. This should be done
+// after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, struct AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by av_rpi_zc_get_buffer2
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+// Get offset from the start of the memory referenced
+// by the vc_handle to valid data
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+// Length of buffer data
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Free the environment allocated by av_rpi_zc_env_alloc
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
+// Test to see if the context is using zc (checks get_buffer2)
+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Free ZC from a context
+// There is nothing magic in this fn - it just packages unsetting
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
+
+
+
+#endif
+
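For reference, a minimal decode-loop sketch using the rpi_zc.h API above. This is illustrative only and not part of the patch: the SAND128 expectation and the display_frame() consumer are hypothetical, and error handling is abbreviated.

#include "libavcodec/avcodec.h"
#include "libavcodec/rpi_zc.h"

// Hypothetical consumer: hands the GPU handle to the display pipeline.
void display_frame(int vc_handle, int offset, int length);

static int decode_one(AVCodecContext *avctx, AVPacket *pkt, AVFrame *frame)
{
    int got_frame = 0;

    // One-time setup, done before avcodec_open2():
    //   av_rpi_zc_init(avctx);        // installs av_rpi_zc_get_buffer2 + buffer pool
    //   avctx->refcounted_frames = 1; // required - see the note above

    if (avcodec_decode_video2(avctx, frame, &got_frame, pkt) < 0 || !got_frame)
        return 0;

    // Take a ZC reference; maycopy=1 allows a fallback copy for frames that
    // did not come from the ZC allocator.
    AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, AV_PIX_FMT_SAND128, 1);
    if (ref != NULL) {
        display_frame(av_rpi_zc_vc_handle(ref), // GPU memory handle
                      av_rpi_zc_offset(ref),    // start of pixel data within it
                      av_rpi_zc_length(ref));
        av_rpi_zc_unref(ref); // in real use, keep the ref until display has finished
    }

    av_frame_unref(frame);    // manual unref needed with refcounted_frames = 1
    return 1;
}

At teardown, av_rpi_zc_uninit(avctx) restores the original get_buffer2 callback and frees the pool environment once its refcount drops to zero.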
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index c4af9cbb17..c1b806e51b 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
*/
#include "config.h"
+
+#ifdef RPI
+// Move video buffers to GPU memory
+#define RPI_GPU_BUFFERS
+#endif
+
#include "libavutil/atomic.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
@@ -39,6 +45,7 @@
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/rpi_sand_fns.h"
#include "libavutil/imgutils.h"
#include "libavutil/samplefmt.h"
#include "libavutil/dict.h"
@@ -64,6 +71,10 @@
#include "libavutil/ffversion.h"
const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+#ifdef RPI_GPU_BUFFERS
+#include "rpi_qpu.h"
+#endif
+
#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
static int default_lockmgr_cb(void **arg, enum AVLockOp op)
{
@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
return ret;
}
+#ifdef RPI_GPU_BUFFERS
+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+{
+ GPU_MEM_PTR_T *p = opaque;
+ gpu_free(p);
+ av_free(p);
+}
+
+static AVBufferRef *rpi_buffer_alloc(int size)
+{
+ AVBufferRef *ret = NULL;
+ uint8_t *data = NULL;
+ GPU_MEM_PTR_T *p;
+
+ static int total=0;
+ total+=size;
+
+ p = av_malloc(sizeof *p);
+ if (!p)
+ return NULL;
+
+ if (gpu_malloc_cached(size,p)<0) // Change this line to choose cached or uncached memory. The caching here refers to the ARM data cache.
+ return NULL;
+
+ data = p->arm;
+ printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+ //memset(data, 64, size);
+
+ if (!data)
+ return NULL;
+
+ ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+ if (!ret) {
+ gpu_free(p);
+ av_freep(&p);
+ }
+
+ return ret;
+}
+#endif
+
static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
{
FramePool *pool = avctx->internal->pool;
@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
av_buffer_pool_uninit(&pool->pools[i]);
pool->linesize[i] = linesize[i];
if (size[i]) {
+#ifdef RPI_GPU_BUFFERS
+ if (avctx->codec_id == AV_CODEC_ID_HEVC)
+ pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+ CONFIG_MEMORY_POISONING ?
+ NULL :
+ rpi_buffer_alloc);
+ else
+#endif
pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
CONFIG_MEMORY_POISONING ?
NULL :
@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
{
int ret;
+#ifdef RPI
+ // This is going to end badly if we let it continue
+ av_assert0(!av_rpi_is_sand_frame(frame));
+#endif
+
if ((ret = update_frame_pool(avctx, frame)) < 0)
return ret;
diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index 21f8d9e00d..71ce7b9186 100644
--- a/libavfilter/avfilter.c
+++ b/libavfilter/avfilter.c
@@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args)
"options, but options were provided: %s.\n", args);
return AVERROR(EINVAL);
}
+ printf("=== args='%s'\n", args);
#if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR
if ( !strcmp(filter->filter->name, "format") ||
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index 6767b65ec8..f270190d57 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
#endif
{ 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
{ 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
{ 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
{ 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
{ 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 5a35953d24..d36fdc3199 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
int default_stream_index = av_find_default_stream_index(s);
if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
for (i = 0; i < s->nb_streams; i++) {
- if (av_find_program_from_stream(s, NULL, i))
+ if (0 && av_find_program_from_stream(s, NULL, i))
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 1e061763a2..cbc9bc145b 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -59,6 +59,8 @@ HEADERS = adler32.h \
rational.h \
replaygain.h \
ripemd.h \
+ rpi_sand_fns.h \
+ rpi_sand_fn_pw.h \
samplefmt.h \
sha.h \
sha512.h \
@@ -136,6 +138,7 @@ OBJS = adler32.o \
reverse.o \
rc4.o \
ripemd.o \
+ rpi_sand_fns.o \
samplefmt.o \
sha.o \
sha512.o \
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..b74b7c4e2f 100644
--- a/libavutil/arm/Makefile
+++ b/libavutil/arm/Makefile
@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \
NEON-OBJS += arm/float_dsp_init_neon.o \
arm/float_dsp_neon.o \
+ arm/rpi_sand_neon.o \
diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
new file mode 100644
index 0000000000..dbffdaefa4
--- /dev/null
+++ b/libavutil/arm/rpi_sand_neon.S
@@ -0,0 +1,40 @@
+#include "libavutil/arm/asm.S"
+
+@ void rpi_sand128b_stripe_to_8_10(
+@ uint8_t * dest, [r0]
+@ const uint8_t * src1, [r1]
+@ const uint8_t * src2, [r2]
+@ unsigned int lines); [r3]
+
+.macro stripe2_to_8, bit_depth
+ vpush {q4-q7}
+1:
+ vldm r1!, {q0-q7}
+ subs r3, #1
+ vldm r2!, {q8-q15}
+ vqrshrn.u16 d0, q0, #\bit_depth - 8
+ vqrshrn.u16 d1, q1, #\bit_depth - 8
+ vqrshrn.u16 d2, q2, #\bit_depth - 8
+ vqrshrn.u16 d3, q3, #\bit_depth - 8
+ vqrshrn.u16 d4, q4, #\bit_depth - 8
+ vqrshrn.u16 d5, q5, #\bit_depth - 8
+ vqrshrn.u16 d6, q6, #\bit_depth - 8
+ vqrshrn.u16 d7, q7, #\bit_depth - 8
+ vqrshrn.u16 d8, q8, #\bit_depth - 8
+ vqrshrn.u16 d9, q9, #\bit_depth - 8
+ vqrshrn.u16 d10, q10, #\bit_depth - 8
+ vqrshrn.u16 d11, q11, #\bit_depth - 8
+ vqrshrn.u16 d12, q12, #\bit_depth - 8
+ vqrshrn.u16 d13, q13, #\bit_depth - 8
+ vqrshrn.u16 d14, q14, #\bit_depth - 8
+ vqrshrn.u16 d15, q15, #\bit_depth - 8
+ vstm r0!, {q0-q7}
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+.endm
+
+function rpi_sand128b_stripe_to_8_10, export=1
+ stripe2_to_8 10
+endfunc
+
diff --git a/libavutil/buffer.c b/libavutil/buffer.c
index 694e116a3c..203ca7b3a8 100644
--- a/libavutil/buffer.c
+++ b/libavutil/buffer.c
@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
return ret;
}
+
+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+ BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+ return buf->opaque;
+}
diff --git a/libavutil/buffer.h b/libavutil/buffer.h
index 0c0ce12cf2..82e0bc3058 100644
--- a/libavutil/buffer.h
+++ b/libavutil/buffer.h
@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
*/
AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+// Return the opaque for the underlying frame
+void *av_buffer_pool_opaque(AVBufferRef *ref);
+
/**
* @}
*/
diff --git a/libavutil/frame.h b/libavutil/frame.h
index 2b5c3320c3..990347e484 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -120,7 +120,20 @@ enum AVFrameSideDataType {
* The GOP timecode in 25 bit timecode format. Data format is 64-bit integer.
* This is set on the first frame of a GOP that has a temporal reference of 0.
*/
- AV_FRAME_DATA_GOP_TIMECODE
+ AV_FRAME_DATA_GOP_TIMECODE,
+
+ /**
+ * The data represents the AVSphericalMapping structure defined in
+ * libavutil/spherical.h.
+ */
+ AV_FRAME_DATA_SPHERICAL,
+
+ /**
+ * Extra data required to deal with a cropped Sand frame
+ * AVFrame holds the cropped size, but we cannot simply offset the start
+ * address to get the picture as we can for planar formats
+ */
+ AV_FRAME_DATA_SAND_INFO,
};
enum AVActiveFormatDescription {
@@ -133,6 +146,13 @@ enum AVActiveFormatDescription {
AV_AFD_SP_4_3 = 15,
};
+typedef struct AVFrameDataSandInfo
+{
+ unsigned int left_offset;
+ unsigned int top_offset;
+ unsigned int pic_width;
+ unsigned int pic_height;
+} AVFrameDataSandInfo;
/**
* Structure to hold side data for an AVFrame.
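A hedged sketch of how a consumer might read the Sand crop side data declared above. Whether a given decoder attaches AV_FRAME_DATA_SAND_INFO is not shown in this hunk, and the function name is hypothetical.

#include "libavutil/frame.h"
#include "libavutil/log.h"

static void log_sand_crop(const AVFrame *frame)
{
    const AVFrameSideData *sd =
        av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);

    if (sd != NULL && sd->size >= sizeof(AVFrameDataSandInfo)) {
        const AVFrameDataSandInfo *si = (const AVFrameDataSandInfo *)sd->data;
        av_log(NULL, AV_LOG_INFO, "sand crop: +%u+%u within %ux%u\n",
               si->left_offset, si->top_offset, si->pic_width, si->pic_height);
    }
}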
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 0dffa4dbdb..17134b4f38 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR |
AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA,
},
+ [AV_PIX_FMT_SAND128] = {
+ .name = "sand128",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */
+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_SAND64_10] = {
+ .name = "sand64_10",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */
+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */
+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */
+ },
+ .flags = 0,
+ },
};
#if FF_API_PLUS1_MINUS1
FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 0ed01c4844..2155b78704 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -303,7 +303,22 @@ enum AVPixelFormat {
AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian
AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian
- AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec
+
+ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian
+ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian
+ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian
+ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian
+
+ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian
+ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian
+
+// RPI - not on ifdef so can be got at by calling progs
+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+
+ AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
};
#define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A
diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
new file mode 100644
index 0000000000..52d52a2a83
--- /dev/null
+++ b/libavutil/rpi_sand_fn_pw.h
@@ -0,0 +1,182 @@
+// * Included twice from rpi_sand_fns.c with different PW
+
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// unclipped
+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x;
+ const unsigned int w = _w;
+ const unsigned int mask = stride1 - 1;
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
+ memcpy(dst, p, w);
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const uint8_t * p = p2;
+ uint8_t * d = dst;
+ memcpy(d, p1, w1);
+ d += w1;
+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
+ memcpy(d, p, stride1);
+ }
+ memcpy(d, p, w3);
+ }
+ }
+}
+
+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
+
+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ const pixel * p = (const pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * p = (const pixel *)p1;
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+}
+
+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+}
+
+
+#undef pixel
+#undef STRCAT
+#undef FUNC
+
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
new file mode 100644
index 0000000000..ec4cfadf8a
--- /dev/null
+++ b/libavutil/rpi_sand_fns.c
@@ -0,0 +1,99 @@
+#include "config.h"
+#ifdef RPI
+#include <stdint.h>
+#include <string.h>
+#include "rpi_sand_fns.h"
+#include "avassert.h"
+
+#define PW 1
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#define PW 2
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#if HAVE_NEON
+void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines);
+#endif
+
+#if 1
+// Simple round
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ const unsigned int rnd = (1 << shr) >> 1;
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ *dst++ = (*src++ + rnd) >> shr;
+ }
+}
+#else
+// Dithered variation
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ unsigned int rnd = (1 << shr) >> 1;
+ const unsigned int mask = ((1 << shr) - 1);
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ rnd = *src++ + (rnd & mask);
+ *dst++ = rnd >> shr;
+ }
+}
+#endif
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr)
+{
+ const unsigned int n = dst_stride1 / 2;
+ unsigned int j;
+
+ // This is true for our current layouts
+ av_assert0(dst_stride1 == src_stride1);
+
+ // As we have the same stride1 for src & dest and src is wider than dest
+ // then if we loop on src we can always write contiguously to dest
+ // We make no effort to copy an exact width - round up to nearest src stripe
+ // as we will always have storage in dest for that
+
+#if HAVE_NEON
+ if (shr == 3 && src_stride1 == 128) {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
+ }
+ }
+ else
+#endif
+ {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ cpy16_to_8(d + n, s2, n, shr);
+ }
+ }
+ }
+
+ // Fix up a trailing dest half stripe
+ if (j < w) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ }
+ }
+}
+
+#endif // RPI
+
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
new file mode 100644
index 0000000000..aa880d0f63
--- /dev/null
+++ b/libavutil/rpi_sand_fns.h
@@ -0,0 +1,129 @@
+#ifndef AVUTIL_RPI_SAND_FNS
+#define AVUTIL_RPI_SAND_FNS
+#ifdef RPI
+
+#include "libavutil/frame.h"
+
+// For all these fns _x & _w are measured as coord * PW
+// For the C fns coords are in chroma pels (so luma / 2)
+// Strides are in bytes
+
+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr);
+
+
+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+{
+    // * We could replace this with a fixed 128, which would allow the compiler
+    //   to optimize a whole lot better
+ return frame->linesize[0];
+}
+
+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
+{
+ return frame->linesize[3];
+}
+
+
+static inline int av_rpi_is_sand_format(const int format)
+{
+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
+}
+
+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+{
+ return av_rpi_is_sand_format(frame->format);
+}
+
+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
+{
+ return (frame->format == AV_PIX_FMT_SAND128);
+}
+
+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
+{
+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
+}
+
+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
+{
+ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
+}
+
+// If x is measured in bytes (not pixels) then this works for sand64_16 as
+// well as sand128 - but in the general case we work that out
+
+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y + stride2 * x2;
+}
+
+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y_c + stride2 * x2;
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
+}
+
+#endif
+#endif
+
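A worked example of the offset arithmetic in av_rpi_sand_frame_off_y above. The stripe geometry is hypothetical (stride2, i.e. lines per stripe, depends on the allocated frame) and is chosen only to make the numbers concrete.

// SAND128 luma (xshl = 0), stride1 = 128 bytes, stride2 = 544 lines per stripe (assumed).
// Luma byte (x = 300, y = 16):
//   x1  = 300 & (128 - 1)        = 44       offset within the stripe
//   x2  = 300 ^ 44               = 256      byte column where the stripe starts (multiple of 128)
//   off = 44 + 128*16 + 544*256  = 141356   bytes from frame->data[0]
// i.e. the pixel sits in the third 128-byte-wide stripe (whose data starts at
// 2 * stride1 * stride2 = 139264 bytes), 16 lines down and 44 bytes across.

This is consistent with the per-stripe advance of stride1 * stride2 bytes (sstride) used by the copy loops in rpi_sand_fn_pw.h.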
diff --git a/libswscale/input.c b/libswscale/input.c
index 14ab5abb3a..7a827c71e3 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -719,6 +719,13 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
}
}
+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ // NIF
+}
+
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_P010BE:
c->chrToYV12 = p010BEToUV_c;
break;
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_SAND64_10:
+ c->chrToYV12 = sand128ToUV_c; // NIF
+ break;
}
if (c->chrSrcHSubSample) {
switch (srcFormat) {
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 576d8f0d5a..fd88a5e51e 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
[AV_PIX_FMT_AYUV64LE] = { 1, 1},
[AV_PIX_FMT_P010LE] = { 1, 0 },
[AV_PIX_FMT_P010BE] = { 1, 0 },
+#ifdef RPI
+ [AV_PIX_FMT_SAND128] = { 1, 0 },
+ [AV_PIX_FMT_SAND64_10] = { 1, 0 },
+#endif
};
int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
new file mode 100644
index 0000000000..b1e99a6a89
--- /dev/null
+++ b/pi-util/BUILD.txt
@@ -0,0 +1,25 @@
+Building Pi FFmpeg
+==================
+
+Configuration:
+=============
+
+pi-util/conf_pi2.sh
+
+contains suitable options to build the code for Pi2/3. It expects to find
+git clones of
+
+https://github.com/raspberrypi/tools
+https://github.com/raspberrypi/firmware
+
+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
+lot of history you don't want.
+
+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
+rebuilt. Otherwise the prebuilt .c & .h files will be used.
+Likewise, ../local/bin/vasmvidcore_std will enable a rebuild of the VPU code.
+
+pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time
+H265 QPU acceleration is broken on Pi1 and so it is disabled.
+
+
diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
new file mode 100644
index 0000000000..f05b7753f7
--- /dev/null
+++ b/pi-util/conf_h265.2016.csv
@@ -0,0 +1,193 @@
+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
+2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
+2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5
+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt
+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt
+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt
+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt
+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt
+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt
+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5
+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5
+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5
+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5
+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5
+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5
+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5
+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5
+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5
+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5
+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5
+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5
+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5
+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5
+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5
+2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5
+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5
+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt
+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt
+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5
+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5
+1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5
+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5
+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5
+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5
+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5
+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5
+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5
+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5
+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5
+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5
+2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5
diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
new file mode 100644
index 0000000000..6082641271
--- /dev/null
+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
@@ -0,0 +1,147 @@
+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000000..fc14f2a3c2
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh
new file mode 100755
index 0000000000..ec25b81c31
--- /dev/null
+++ b/pi-util/conf_pi1.sh
@@ -0,0 +1,31 @@
+echo "Configure for Pi1"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --cpu=arm1176jzf-s\
+ --arch=arm\
+ --disable-neon\
+ --target-os=linux\
+ --disable-stripping\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
new file mode 100755
index 0000000000..f8e5e75375
--- /dev/null
+++ b/pi-util/conf_pi2.sh
@@ -0,0 +1,30 @@
+echo "Configure for Pi2/3"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --arch=armv6t2\
+ --cpu=cortex-a7\
+ --target-os=linux\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
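Both conf_pi1.sh and conf_pi2.sh are run from the top of the FFmpeg source tree. Judging by the RPI_TOOLROOT and RPI_OPT_VC definitions, they assume the Raspberry Pi cross-toolchain and firmware checkouts sit alongside that tree, roughly:

    <work>/ffmpeg/       <- run ./pi-util/conf_pi1.sh or conf_pi2.sh from here
    <work>/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/
    <work>/firmware/opt/vc/

where <work> is any working directory. If the checkouts live elsewhere, RPI_TOOLROOT and RPI_OPT_VC should be the only variables that need changing, since the include, library-search and cross-prefix paths are all derived from them.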
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100755
index 0000000000..70f7be22bb
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+import string
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+ffmpeg_exec = "./ffmpeg"
+
+def testone(fileroot, srcname, es_file, md5_file):
+ tmp_root = "/tmp"
+
+ names = srcname.split('/')
+ while len(names) > 1:
+ tmp_root = os.path.join(tmp_root, names[0])
+ del names[0]
+ name = names[0]
+
+ if not os.path.exists(tmp_root):
+ os.makedirs(tmp_root)
+
+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
+ try:
+ os.remove(dec_file)
+ except:
+ pass
+
+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+ # Unaligned needed for cropping conformance
+ rstr = subprocess.call(
+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
+ m1 = None
+ m2 = None
+ with open(os.path.join(fileroot, md5_file)) as f:
+ for line in f:
+ m1 = re.search("[0-9a-f]{32}", line.lower())
+ if m1:
+ break
+
+ with open(dec_file) as f:
+ m2 = re.search("[0-9a-f]{32}", f.readline())
+ except:
+ pass
+
+ if m1 and m2 and m1.group() == m2.group():
+ print >> flog, "Match: " + m1.group()
+ rv = 0
+ elif not m1:
+ print >> flog, "****** Cannot find m1"
+ rv = 3
+ elif not m2:
+ print >> flog, "****** Cannot find m2"
+ rv = 2
+ else:
+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+ rv = 1
+ flog.close()
+ return rv
+
+def scandir(root):
+ aconf = []
+ ents = os.listdir(root)
+ ents.sort(key=str.lower)
+ for name in ents:
+ test_path = os.path.join(root, name)
+ if S_ISDIR(os.stat(test_path).st_mode):
+ files = os.listdir(test_path)
+ es_file = "?"
+ md5_file = "?"
+ for f in files:
+ (base, ext) = os.path.splitext(f)
+ if base[0] == '.':
+ pass
+ elif ext == ".bit" or ext == ".bin":
+ es_file = f
+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
+ if md5_file == "?":
+ md5_file = f
+ elif base[-3:] == "yuv":
+ md5_file = f
+ aconf.append((1, name, es_file, md5_file))
+ return aconf
+
+def runtest(name, tests):
+ if not tests:
+ return True
+ for t in tests:
+ if name[0:len(t)] == t or name.find("/" + t) != -1:
+ return True
+ return False
+
+def doconf(csva, tests, test_root):
+ unx_failures = []
+ unx_success = []
+ failures = 0
+ successes = 0
+ for a in csva:
+ exp_test = int(a[0])
+ if (exp_test and runtest(a[1], tests)):
+ name = a[1]
+ print "==== ", name,
+ sys.stdout.flush()
+
+ rv = testone(os.path.join(test_root, name), name, a[2], a[3])
+ if (rv == 0):
+ successes += 1
+ else:
+ failures += 1
+
+ if (rv == 0):
+ if exp_test == 2:
+ print ": * OK *"
+ unx_success.append(name)
+ else:
+ print ": ok"
+ elif exp_test == 2 and rv == 1:
+ print ": fail"
+ elif exp_test == 3 and rv == 2:
+ # Call an expected "crash" an abort
+ print ": abort"
+ else:
+ unx_failures.append(name)
+ if rv == 1:
+ print ": * FAIL *"
+ elif (rv == 2) :
+ print ": * CRASH *"
+ elif (rv == 3) :
+ print ": * MD5 MISSING *"
+ else :
+ print ": * BANG *"
+
+ if unx_failures or unx_success:
+ print "Unexpected Failures:", unx_failures
+ print "Unexpected Success: ", unx_success
+ else:
+ print "All tests normal:", successes, "ok,", failures, "failed"
+
+
+class ConfCSVDialect(csv.Dialect):
+ delimiter = ','
+ doublequote = True
+ lineterminator = '\n'
+ quotechar='"'
+ quoting = csv.QUOTE_MINIMAL
+ skipinitialspace = True
+ strict = True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+ argp.add_argument("tests", nargs='*')
+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
+ args = argp.parse_args()
+
+ if args.csvgen:
+ csv.writer(sys.stdout).writerows(scandir(args.test_root))
+ exit(0)
+
+ with open(args.csv, 'rt') as csvfile:
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+
+ doconf(csva, args.tests, args.test_root)
+
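A note on the conf_h265*.csv files consumed above: each row is expected-result code, test name, elementary-stream file, md5 file; any further columns (used as comments on a few rows) are ignored. From the way doconf() and testone() treat the first column, 0 means the test is skipped, 1 means a normal pass is expected, 2 marks a known md5 mismatch, and 3 marks a stream expected to produce no decoded md5 at all (reported as "abort"). A minimal sketch of that interpretation, using a row that appears in conf_h265.csv:

    # Illustrative only - mirrors the column handling in doconf() above.
    EXPECT = {0: "skipped", 1: "expect match", 2: "expect mismatch", 3: "expect abort"}
    row = ["1", "WP_B_Toshiba_3", "WP_B_Toshiba_3.bit", "WP_B_Toshiba_3_yuv.md5"]
    exp_test, name, es_file, md5_file = int(row[0]), row[1], row[2], row[3]
    print("%s -> %s" % (name, EXPECT[exp_test]))    # WP_B_Toshiba_3 -> expect match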
diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
new file mode 100755
index 0000000000..27cc453963
--- /dev/null
+++ b/pi-util/ffperf.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+
+import time
+import string
+import os
+import tempfile
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class tstats:
+ close_threshold = 0.01
+
+ def __init__(self, stats_dict=None):
+ if stats_dict != None:
+ self.name = stats_dict["name"]
+ self.elapsed = float(stats_dict["elapsed"])
+ self.user = float(stats_dict["user"])
+ self.sys = float(stats_dict["sys"])
+
+ def times_str(self):
+ ctime = self.sys + self.user
+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+
+ def dict(self):
+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+
+ def is_close(self, other):
+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+
+ def __lt__(self, other):
+ return self.elapsed < other.elapsed
+ def __gt__(self, other):
+ return self.elapsed > other.elapsed
+
+    @staticmethod
+    def time_file(name, prefix):
+        stats = tstats()
+        stats.name = name
+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
+                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog)
+        pinfo = os.wait4(cproc.pid, 0)
+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+ stats.elapsed = end_time - start_time
+ stats.user = pinfo[2].ru_utime
+ stats.sys = pinfo[2].ru_stime
+ return stats
+
+
+def common_prefix(s1, s2):
+ for i in range(min(len(s1),len(s2))):
+ if s1[i] != s2[i]:
+ return s1[:i]
+ return s1[:i+1]
+
+def main():
+ global flog
+
+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
+To blank the screen before starting, use "xdg-screensaver activate"
+(for some reason this doesn't seem to work from within Python).
+""")
+
+ argp.add_argument("streams", nargs='*')
+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
+ argp.add_argument("--csv_in", help="CSV input filename")
+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
+
+ args = argp.parse_args()
+
+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
+ csv_out.writeheader()
+
+ stats_in = {}
+ if args.csv_in != None:
+ with open(args.csv_in, 'r', newline='') as f_in:
+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
+
+ streams = args.streams
+ if not streams:
+ if not stats_in:
+ print ("No source streams specified")
+ return 1
+ prefix = "" if args.prefix == None else args.prefix
+ streams = [k for k in stats_in]
+ elif args.prefix != None:
+ prefix = args.prefix
+ else:
+ prefix = streams[0]
+ for f in streams[1:]:
+ prefix = common_prefix(prefix, f)
+ pp = prefix.rpartition(os.sep)
+ prefix = pp[0] + pp[1]
+ streams = [s[len(prefix):] for s in streams]
+
+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
+ print ("====", f)
+
+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
+ for i in range(3):
+ t = tstats.time_file(f, prefix)
+ print ("...", t.times_str())
+ if t0 > t:
+ t0 = t
+
+ if t0.name in stats_in:
+ pstat = stats_in[t0.name]
+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
+
+ csv_out.writerow(t0.dict())
+
+ print ()
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
+
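For each stream, ffperf.py decodes the first 30 seconds of input three times, keeps the fastest run, and takes user/sys CPU time from the os.wait4() rusage of the ffmpeg child. Results are written to ffperf_out.csv (columns name, elapsed, user, sys); passing an earlier run back in with --csv_in prints the previous figures alongside a marker ("---" within the 1% close_threshold, "<<<" if the new run is faster, ">>>" if slower). A sketch of reloading such a CSV for an ad-hoc comparison:

    # Illustrative only - mirrors the --csv_in handling in main() above.
    import csv
    with open("ffperf_out.csv", newline='') as f:
        baseline = {row["name"]: float(row["elapsed"]) for row in csv.DictReader(f)}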
diff --git a/pi-util/make_array.py b/pi-util/make_array.py
new file mode 100755
index 0000000000..864fa5e704
--- /dev/null
+++ b/pi-util/make_array.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+# Usage
+# make_array file.bin
+# Produces file.h with array of bytes.
+#
+import sys
+for file in sys.argv[1:]:
+ prefix,suffix = file.split('.')
+ assert suffix=='bin'
+ name=prefix.split('/')[-1]
+ print 'Converting',file
+ with open(prefix+'.h','wb') as out:
+ print >>out, 'static const unsigned char',name,'[] = {'
+ with open(file,'rb') as fd:
+ for byte in fd.read():
+ print >>out, '%d,' % ord(byte)
+ print >>out,'};'
+
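make_array.py is a small Python 2 helper (it uses print statements, like ffconf.py above): for an argument such as a hypothetical libavcodec/rpi_shader.bin it writes libavcodec/rpi_shader.h containing static const unsigned char rpi_shader[] = { ... }; with one decimal byte value per line, so a binary blob can be compiled straight into C code.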
diff --git a/pi-util/qem.sh b/pi-util/qem.sh
new file mode 100755
index 0000000000..5ce2eeaf72
--- /dev/null
+++ b/pi-util/qem.sh
@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+QASM=python\ ../local/bin/qasm.py
+SRC_FILE=libavcodec/rpi_shader.qasm
+DST_BASE=shader
+
+cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
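qem.sh copies libavcodec/rpi_shader_cmd.h into a Broadcom VC4 development tree and regenerates shader.c/shader.h there from libavcodec/rpi_shader.qasm; the TARGET_DIR and the ../local/bin/qasm.py path are clearly specific to the original author's setup and will need adjusting before the script is usable elsewhere.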
diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
new file mode 100755
index 0000000000..5935a11ca5
--- /dev/null
+++ b/pi-util/v3dusage.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import re
+
+def do_logparse(logname):
+
+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
+
+ ttotal = {'idle':0.0}
+ tstart = {}
+ qctotal = {}
+ qtstotal = {}
+ l2hits = {}
+ l2total = {}
+ time0 = None
+ idle_start = None
+ qpu_op_no = 0
+ op_count = 0
+
+ with open(logname, "rt") as infile:
+ for line in infile:
+ match = rmatch.match(line)
+ if match:
+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
+ time = float(match.group(1))
+ unit = match.group(3)
+ opstart = not match.group(2)
+ optype = match.group(7)
+ hascb = match.group(8) != "0"
+
+ if unit == 'qpu1':
+ unit = unit + "." + str(qpu_op_no)
+ if not opstart:
+ if hascb or optype == 'EXECUTE_SYNC':
+ qpu_op_no = 0
+ else:
+ qpu_op_no += 1
+
+ # Ignore sync type
+ if optype == 'EXECUTE_SYNC':
+ continue
+
+ if not time0:
+ time0 = time
+
+ if opstart:
+ tstart[unit] = time;
+ elif unit in tstart:
+ op_count += 1
+ if not unit in ttotal:
+ ttotal[unit] = 0.0
+ ttotal[unit] += time - tstart[unit]
+ del tstart[unit]
+
+ if not idle_start and not tstart:
+ idle_start = time
+ elif idle_start and tstart:
+ ttotal['idle'] += time - idle_start
+ idle_start = None
+
+ match = rqcycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qctotal:
+ qctotal[unit] = 0
+ qctotal[unit] += int(match.group(2))
+
+ match = rqtscycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qtstotal:
+ qtstotal[unit] = 0
+ qtstotal[unit] += int(match.group(2))
+
+ match = rl2hits.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in l2total:
+ l2total[unit] = 0
+ l2hits[unit] = 0
+ l2total[unit] += int(match.group(3))
+ if match.group(2) == "hits":
+ l2hits[unit] += int(match.group(3))
+
+
+ if not time0:
+ print "No v3d profile records found"
+ else:
+ tlogged = time - time0
+
+ print "Logged time:", tlogged, " Op count:", op_count
+ for unit in sorted(ttotal):
+            print '%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
+        print
+        for unit in sorted(qctotal):
+            if not unit in qtstotal:
+                qtstotal[unit] = 0
+            print '%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
+            if unit in l2total:
+                print ' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
+
+
+
+if __name__ == '__main__':
+ argp = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="QPU/VPU perf summary from VC logging",
+ epilog = """
+Will also summarise TMU stalls if logging requests are set in the qpu noflush
+param in the profiled code.
+
+Example use:
+ vcgencmd set_logging level=0xc0
+ <command to profile>
+ sudo vcdbg log msg >& t.log
+ v3dusage.py t.log
+""")
+
+ argp.add_argument("logfile")
+ args = argp.parse_args()
+
+ do_logparse(args.logfile)
+
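The regular expressions at the top of do_logparse() define the shape of the "sudo vcdbg log msg" lines the script understands: a millisecond-precision timestamp, an optional "done ", a unit name (vpu0, vpu1 or qpu1), an upper-case op type and a control-block handle. A minimal sketch with an invented but correctly shaped line:

    # Illustrative only - the op name and cb handle below are invented, but the
    # line shape is what the rmatch pattern in do_logparse() expects.
    import re
    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
    line = "1234.567: qpu1 EXECUTE_QPU cb:3f00a200 ..."
    m = rmatch.match(line)
    print("%s %s start=%s" % (m.group(3), m.group(7), m.group(2) is None))   # qpu1 EXECUTE_QPU start=True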