LibreELEC.tv/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch (18556 lines, 584 KiB)

diff --git a/.gitignore b/.gitignore
index 524fb73..305632b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,6 +23,7 @@
.\#*
/.config
/.version
+/build/
/ffmpeg
/ffplay
/ffprobe
diff --git a/ffmpeg.c b/ffmpeg.c
index 9ffd833..7a86d7e 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -23,6 +23,11 @@
* multimedia converter based on the FFmpeg libraries
*/
+#ifdef RPI
+#define RPI_DISPLAY
+#define RPI_ZERO_COPY
+#endif
+
#include "config.h"
#include <ctype.h>
#include <string.h>
@@ -66,6 +71,25 @@
# include "libavfilter/buffersrc.h"
# include "libavfilter/buffersink.h"
+#ifdef RPI_DISPLAY
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/mmal/mmal.h>
+#include <interface/mmal/mmal_parameters_camera.h>
+#include <interface/mmal/mmal_buffer.h>
+#include <interface/mmal/util/mmal_util.h>
+#include <interface/mmal/util/mmal_default_components.h>
+#include <interface/mmal/util/mmal_connection.h>
+#include <interface/mmal/util/mmal_util_params.h>
+#pragma GCC diagnostic pop
+#ifdef RPI_ZERO_COPY
+#include "libavcodec/rpi_qpu.h"
+#endif
+#include "libavcodec/rpi_zc.h"
+#endif
+
#if HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/types.h>
@@ -158,6 +182,169 @@ static int restore_tty;
static void free_input_threads(void);
#endif
+#ifdef RPI_DISPLAY
+
+#define NUM_BUFFERS 4
+
+static MMAL_COMPONENT_T* rpi_display = NULL;
+static MMAL_POOL_T *rpi_pool = NULL;
+static volatile int rpi_display_count = 0;
+
+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+{
+ MMAL_POOL_T* pool;
+ size_t i;
+ size_t size = (w*h*3)/2;
+#ifdef RPI_ZERO_COPY
+ mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
+ assert(pool);
+#else
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size);
+
+ for (i = 0; i < NUM_BUFFERS; ++i)
+ {
+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+ char * bufPtr = buffer->data;
+ memset(bufPtr, i*30, w*h);
+ memset(bufPtr+w*h, 128, (w*h)/2);
+ }
+#endif
+
+ return pool;
+}
+
+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+#ifdef RPI_ZERO_COPY
+ av_rpi_zc_unref(buffer->user_data);
+ --rpi_display_count;
+#endif
+ mmal_buffer_header_release(buffer);
+}
+
+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ mmal_buffer_header_release(buffer);
+}
+
+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+{
+ MMAL_COMPONENT_T* display;
+ MMAL_DISPLAYREGION_T region =
+ {
+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+ .layer = 2,
+ .fullscreen = 0,
+ .dest_rect = {x, y, w, h}
+ };
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+
+ bcm_host_init(); // TODO is this needed?
+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+ assert(display);
+
+ mmal_port_parameter_set(display->input[0], &region.hdr);
+
+ {
+ MMAL_ES_FORMAT_T* format = display->input[0]->format;
+ format->encoding = MMAL_ENCODING_I420;
+ format->es->video.width = geo.stride_y;
+ format->es->video.height = geo.height_y;
+ format->es->video.crop.x = 0;
+ format->es->video.crop.y = 0;
+ format->es->video.crop.width = w;
+ format->es->video.crop.height = h;
+ mmal_port_format_commit(display->input[0]);
+ }
+
+ mmal_component_enable(display);
+
+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+
+ mmal_port_enable(display->input[0],display_cb_input);
+ mmal_port_enable(display->control,display_cb_control);
+
+ printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+
+ return display;
+}
+
+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+{
+ MMAL_BUFFER_HEADER_T* buf;
+
+ if (!display || !rpi_pool)
+ return;
+
+ if (rpi_display_count >= 3) {
+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+ return;
+ }
+
+ buf = mmal_queue_get(rpi_pool->queue);
+ if (!buf) {
+ // Running too fast so drop the frame
+ printf("Q alloc failure\n");
+ return;
+ }
+ assert(buf);
+ buf->cmd = 0;
+ buf->offset = 0; // Offset to valid data
+ buf->flags = 0;
+#ifdef RPI_ZERO_COPY
+{
+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+
+ buf->user_data = fr_buf;
+ buf->data = av_rpi_zc_vc_handle(fr_buf);
+ buf->alloc_size =
+ buf->length = av_rpi_zc_numbytes(fr_buf);
+
+ ++rpi_display_count;
+}
+#else
+{
+#error YYY
+ int w = fr->width;
+ int h = fr->height;
+ int w2 = (w+31)&~31;
+ int h2 = (h+15)&~15;
+
+ buf->length = (w2 * h2 * 3)/2;
+ buf->user_data = NULL;
+
+ //mmal_buffer_header_mem_lock(buf);
+ memcpy(buf->data, fr->data[0], w2 * h);
+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+ //mmal_buffer_header_mem_unlock(buf);
+}
+#endif
+
+ while (rpi_display_count >= 3) {
+ usleep(5000);
+ }
+
+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+ {
+ printf("** send failed: depth=%d\n", rpi_display_count);
+ display_cb_input(NULL, buf);
+ }
+}
+
+static void display_exit(MMAL_COMPONENT_T* display)
+{
+ if (display) {
+ if (rpi_pool) {
+ mmal_port_pool_destroy(display->input[0], rpi_pool);
+ }
+ mmal_component_destroy(display);
+ }
+}
+
+#endif
+
+
/* sub2video hack:
Convert subtitles to video with alpha to insert them in filter graphs.
This is a temporary solution until libavfilter gets real subtitles support.
@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
avformat_close_input(&input_files[i]->ctx);
av_freep(&input_files[i]);
}
+
+#ifdef RPI_DISPLAY
+ display_exit(rpi_display);
+#endif
+
for (i = 0; i < nb_input_streams; i++) {
InputStream *ist = input_streams[i];
@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
av_freep(&ist->filters);
av_freep(&ist->hwaccel_device);
+#ifdef RPI_ZERO_COPY
+ av_rpi_zc_uninit(ist->dec_ctx);
+#endif
avcodec_free_context(&ist->dec_ctx);
av_freep(&input_streams[i]);
@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
}
term_exit();
ffmpeg_exited = 1;
+
}
void remove_avoptions(AVDictionary **a, AVDictionary *b)
@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
if (ost->source_index >= 0)
ist = input_streams[ost->source_index];
+#ifdef RPI_DISPLAY
+ if (next_picture && ist != NULL)
+ {
+ if (!rpi_display)
+ rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+ display_frame(ist->dec_ctx, rpi_display, next_picture);
+ }
+#endif
+
if (filter->inputs[0]->frame_rate.num > 0 &&
filter->inputs[0]->frame_rate.den > 0)
duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
ist->dec_ctx->opaque = ist;
ist->dec_ctx->get_format = get_format;
ist->dec_ctx->get_buffer2 = get_buffer;
+
+#ifdef RPI_ZERO_COPY
+ // Overrides the above get_buffer2
+ av_rpi_zc_init(ist->dec_ctx);
+#endif
+
ist->dec_ctx->thread_safe_callbacks = 1;
av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index fd0d1f0..40d22d2 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -5,6 +5,11 @@ NAME = avcodec
HEADERS = avcodec.h \
avdct.h \
avfft.h \
+ rpi_qpu.h \
+ rpi_shader.h \
+ rpi_mailbox.h \
+ rpi_hevc_transform.h \
+ rpi_zc.h \
d3d11va.h \
dirac.h \
dv_profile.h \
@@ -43,6 +48,10 @@ OBJS = allcodecs.o \
resample.o \
resample2.o \
utils.o \
+ rpi_qpu.o \
+ rpi_shader.o \
+ rpi_mailbox.o \
+ rpi_zc.o \
vorbis_parser.o \
xiph.o \
@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
$(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
endif
+
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+ python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+ python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 54efaad..02a89c3 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -667,6 +667,7 @@ void avcodec_register_all(void)
REGISTER_PARSER(H261, h261);
REGISTER_PARSER(H263, h263);
REGISTER_PARSER(H264, h264);
+ REGISTER_PARSER(H264_MVC, h264_mvc);
REGISTER_PARSER(HEVC, hevc);
REGISTER_PARSER(MJPEG, mjpeg);
REGISTER_PARSER(MLP, mlp);
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a4ceca7..1354c14 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_epel_neon.o \
arm/hevcdsp_idct_neon.o \
- arm/hevcdsp_qpel_neon.o
+ arm/hevcdsp_qpel_neon.o \
+ arm/hevcdsp_sao_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b..0a3980a 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,13 +26,34 @@
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"
+
+#if UNCHECKED_BITSTREAM_READER
+#define LOAD_16BITS_BEHI\
+ "ldrh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#elif CONFIG_THUMB
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "it cs \n\t"\
+ "ldrhcs %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#else
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "ldrcsh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#endif
+
+
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
uint8_t *const state)
{
int bit;
+#if 0
void *reg_b, *reg_c, *tmp;
-
__asm__ volatile(
"ldrb %[bit] , [%[state]] \n\t"
"add %[r_b] , %[tables] , %[lps_off] \n\t"
@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
[mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
: "memory", "cc"
);
+#else
+ // *** Not thumb compatible yet
+ unsigned int reg_b, tmp;
+ __asm__ (
+ "ldrb %[bit] , [%[state]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b] , %[tmp], lsl #1] \n\t"
+// %bit = *state
+// %range = range
+// %tmp = RangeLPS
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state]] \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+
+ "bne 2f \n\t"
+ LOAD_16BITS_BEHI
+ "lsr %[tmp] , %[tmp] , #15 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "sub %[tmp] , %[tmp] , %[r_b] \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [ptr]"+&r"(c->bytestream),
+ [tmp]"=&r"(tmp)
+ : [state]"r"(state),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+#endif
return bit & 1;
}
+
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+ int rv = 0;
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "adc %[rv] , %[rv] , #0 \n\t"
+ "it cs \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ite cc \n\t"
+ "rsbcc %[rv] , %[rv] , #0 \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
#endif /* HAVE_ARMV6T2_INLINE */
#endif /* AVCODEC_ARM_CABAC_H */
diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
new file mode 100644
index 0000000..31d3c59
--- /dev/null
+++ b/libavcodec/arm/hevc_cabac.h
@@ -0,0 +1,491 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+ unsigned int n;
+ __asm__ (
+ "rev %[n], %[x] \n\t"
+ : [n]"=r"(n)
+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+ :
+ );
+ return n << (bits & 7);
+}
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about or use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ int rv;
+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+ __asm__ (
+ "ssat %[rv], #16, %[t], ASR #1 \n\t"
+ : [rv]"=r"(rv)
+ : [t]"r"(t)
+ :
+ );
+ return rv;
+}
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ int t;
+ __asm__ (
+ "lsl %[t], %[coeff], #1 \n\t"
+ "lsrs %[t], %[t], %[shift] \n\t"
+ "it eq \n\t"
+ "subeq %[stat], %[stat], #1 \n\t"
+ "cmp %[t], #6 \n\t"
+ "adc %[stat], %[stat], #0 \n\t"
+ "usat %[stat], #8, %[stat] \n\t"
+ : [stat]"+&r"(*stat_coeff),
+ [t]"=&r"(t)
+ : [coeff]"r"(last_coeff_abs_level_remaining),
+ [shift]"r"(c_rice_param)
+ : "cc"
+ );
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater than 1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i, reg_b, st, tmp, bit, rv;
+ __asm__ (
+ "mov %[i] , #0 \n\t"
+ "mov %[rv] , #0 \n\t"
+ "1: \n\t"
+ "add %[i] , %[i] , #1 \n\t"
+ "cmp %[rv] , #0 \n\t"
+ "ite eq \n\t"
+ "usateq %[st] , #2 , %[i] \n\t"
+ "movne %[st] , #0 \n\t"
+
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "and %[bit] , %[bit] , #1 \n\t"
+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "it ne \n\t"
+ "cmpne %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+ "tst %[tmp] , %[tmp] \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+ "cmp %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [i]"=&r"(i),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st),
+ [rv]"=&r"(rv)
+ : [state0]"r"(state0),
+ [n]"r"(n),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+ return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t * const ctx_map,
+ uint8_t * p)
+{
+ unsigned int reg_b, tmp, st, bit;
+ __asm__ (
+ "1: \n\t"
+// Get bin from map
+ "ldrb %[st] , [%[ctx_map], %[n]] \n\t"
+
+// Load state & ranges
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "tst %[bit] , #1 \n\t"
+// GCC asm seems to need strbne written differently for thumb and arm
+#if CONFIG_THUMB
+ "it ne \n\t"
+ "strbne %[n] , [%[idx]] , #1 \n\t"
+#else
+ "strneb %[n] , [%[idx]] , #1 \n\t"
+#endif
+
+// Renorm
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "subs %[n] , %[n] , #1 \n\t"
+#if CONFIG_THUMB
+ "itt ne \n\t"
+ "lslsne %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#else
+ "lslnes %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#endif
+
+// If we have bits left then n must be 0 so give up now
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+// Check to see if we still have more to do
+ "cmp %[n] , #0 \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [idx]"+&r"(p),
+ [n]"+&r"(n),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st)
+ : [state0]"r"(state0),
+ [ctx_map]"r"(ctx_map),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+
+ return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+//
+// By and large these are (at best) no faster than their C equivalents - the
+// only one worth having is _peek where we do a slightly better job than the
+// compiler
+//
+// The others have been stashed here for reference in case larger scale asm
+// is attempted, in which case they might be a useful base
+
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+ uint32_t rv, tmp;
+ __asm__ (
+ "bic %[rv] , %[low], #1 \n\t"
+ "cmp %[inv] , #0 \n\t"
+ "it ne \n\t"
+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+ : // Outputs
+ [rv]"=&r"(rv),
+ [tmp]"=r"(tmp)
+ : // Inputs
+ [low]"r"(c->low),
+ [inv]"r"(c->range)
+ : // Clobbers
+ "cc"
+ );
+ return rv << 1;
+}
+
+#if 0
+
+// ***** Slower than the C :-(
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+{
+ uint32_t m, tmp;
+ __asm__ (
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[m], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+ "mul %[tmp], %[range], %[tmp] \n\t"
+
+ "rev %[m], %[m] \n\t"
+
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[m], %[m], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[m], lsr #9 \n\t"
+ : // Outputs
+ [m]"=&r"(m),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [n]"r"(n),
+ [val]"r"(val),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ );
+}
+
+
+// Works but slower than C
+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+{
+ uint32_t n, val, tmp, level;
+
+// PROFILE_START();
+
+ __asm__ (
+ // Peek
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Count bits (n = prefix)
+ "mvn %[n], %[val] \n\t"
+ "clz %[n], %[n] \n\t"
+
+ "lsl %[level], %[val], %[n] \n\t"
+ "subs %[tmp], %[n], #3 \n\t"
+ "blo 2f \n\t"
+
+ // prefix >= 3
+ // < tmp = prefix - 3
+ // > tmp = prefix + rice - 3
+ "add %[tmp], %[tmp], %[rice] \n\t"
+ // > n = prefix * 2 + rice - 3
+ "add %[n], %[tmp], %[n] \n\t"
+ "cmp %[n], #21 \n\t"
+ "bhi 3f \n\t"
+
+ "orr %[level], %[level], #0x80000000 \n\t"
+ "rsb %[tmp], %[tmp], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // > 22 bits used in total - need reload
+ "3: \n\t"
+
+ // Stash prefix + rice - 3 in level (only spare reg)
+ "mov %[level], %[tmp] \n\t"
+ // Restore n to flush value (prefix)
+ "sub %[n], %[n], %[tmp] \n\t"
+
+ // Flush + reload
+
+// "rsb %[tmp], %[n], #32 \n\t"
+// "lsr %[tmp], %[val], %[tmp] \n\t"
+// "mul %[tmp], %[range], %[tmp] \n\t"
+
+ // As it happens we know that all the bits we are flushing are 1
+ // so we can cheat slightly
+ "rsb %[tmp], %[range], %[range], lsl %[n] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[n], [%[ptr], %[bits], lsr #3] \n\t"
+ "rev %[n], %[n] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[n], %[n], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[n], lsr #9 \n\t"
+
+ // (reload)
+
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Build value
+
+ "mov %[n], %[level] \n\t"
+
+ "orr %[tmp], %[val], #0x80000000 \n\t"
+ "rsb %[level], %[level], #31 \n\t"
+ "lsr %[level], %[tmp], %[level] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // prefix < 3
+ "2: \n\t"
+ "rsb %[tmp], %[rice], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+ "orr %[level], %[level], %[n], lsl %[rice] \n\t"
+ "add %[n], %[n], %[rice] \n\t"
+
+ "1: \n\t"
+ // Flush
+ "add %[n], %[n], #1 \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[val], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "mul %[tmp], %[range], %[tmp] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "rev %[val], %[val] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[val], %[val], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[val], lsr #9 \n\t"
+ : // Outputs
+ [level]"=&r"(level),
+ [n]"=&r"(n),
+ [val]"=&r"(val),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [rice]"r"(c_rice_param),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ "cc"
+ );
+
+// PROFILE_ACC(residual_abs);
+
+ return level;
+}
+#endif
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
index 166bddb..a088cc3 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
vst1.8 {d4}, [r0]
bx lr
endfunc
+
+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ * int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ * MvField *curr, MvField *neigh, uint8_t *bs)
+ */
+function ff_hevc_deblocking_boundary_strengths_neon, export=1
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
+ ldmia ip, {v5-v7}
+1: ldmdb ip, {v1-v4}
+ ldrsb a3, [v5, #8] @ curr->ref_idx
+ ldrsb v8, [v5, #9]
+ ldrsb ip, [v6, #8] @ neigh->ref_idx
+ ldrsb lr, [v6, #9]
+ ldr v1, [v1, a3, lsl #2]
+ ldrb a3, [v5, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
+ ldrb v8, [v6, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
+ teq a3, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
+ tst a3, #1
+ itee ne
+ ldrne a3, [v5, #0] @ curr->mv[0]
+ ldreq a3, [v5, #4] @ curr->mv[1]
+ moveq v1, v2
+ tst v8, #1
+ itee ne
+ ldrne v8, [v6, #0] @ neigh->mv[0]
+ ldreq v8, [v6, #4] @ neigh->mv[1]
+ moveq v3, v4
+ teq v1, v3
+ bne 10f
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v8, a3
+ ssub16 a3, a3, v8
+ sel a3, a3, ip
+ ands a3, a3, lr
+ @ drop through
+10: it ne
+ movne a3, #1
+11: subs a2, a2, #1
+12:
+A strbhs a3, [v7], a4
+T itt hs
+T strbhs a3, [v7]
+T addhs v7, v7, a4
+ subs a2, a2, #1
+ bhs 12b
+
+ ldm sp, {a2, a3}
+ add ip, sp, #16*4
+ subs a1, a1, #1
+ add v5, v5, a3
+ add v6, v6, a3
+ bhi 1b
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
+ bne 10b
+
+ teq v1, v3
+ it eq
+ teqeq v2, v4
+ bne 40f
+ teq v1, v2
+ bne 30f
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 25f
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
+ ssub16 a3, v1, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v3, v2
+ ssub16 a3, v2, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+30: ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+40: teq v1, v4
+ ite eq
+ teqeq v2, v3
+ bne 10b
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
+90: mov a3, #1
+ b 11b
+endfunc
diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
new file mode 100644
index 0000000..00eab9e
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_epel_neon.S
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro vextin_d4
+ vld1.8 {q10}, [r1], r2
+ vmov d16, d20
+ vext.8 d17, d20, d21, #1
+ vext.8 d18, d20, d21, #2
+ vext.8 d19, d20, d21, #3
+.endm
+
+.macro vextin_d4_8
+ vld1.8 d16, [r1], r2
+ vext.8 d17, d16, d16, #1
+ vext.8 d18, d16, d16, #2
+ vext.8 d19, d16, d16, #3
+.endm
+
+.macro load_coeffs_16b coeffs
+ ldr \coeffs, [\coeffs]
+ vdup.i8 d0, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d1, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d2, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d3, \coeffs
+.endm
+
+.macro epel_filter_16b out=q12
+ vmull.u8 q3, d16, d0
+ vmull.u8 q11, d19, d3
+ vmull.u8 \out, d17, d1
+ vmull.u8 q10, d18, d2
+ vadd.s16 q3, q11
+ vadd.s16 \out, q10
+ vsub.s16 \out, q3
+.endm
+
+.macro load_coeffs_32b coeffs
+ ldr \coeffs, [\coeffs]
+ vmov.i64 d4, #0
+ vmov.8 d4[0], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[2], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[4], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[6], \coeffs
+.endm
+
+.macro epel_filter_32b
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q4, d25, d4[0]
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q6, d31, d4[3]
+
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q8, d27, d4[1]
+ vmull.s16 q9, d28, d4[2] // q14
+ vmull.s16 q10, d29, d4[2]
+ vadd.s32 q3, q5
+ vadd.s32 q4, q6
+ vadd.s32 q7, q9
+ vadd.s32 q8, q10
+ vsub.s32 q7, q3
+ vsub.s32 q8, q4
+ vqshrn.s32 d6, q7, #6
+ vqshrn.s32 d7, q8, #6
+.endm
+
+.macro epel_filter_32b_4
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q9, d28, d4[2] // q14
+ vadd.s32 q3, q5
+ vadd.s32 q7, q9
+ vsub.s32 q7, q3
+ vqshrn.s32 d6, q7, #6
+.endm
+
+function ff_hevc_put_epel_h_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #16] // mx
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+@ adr reaches if we are in thumb mode but not in arm
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ sub r1, #1
+ lsl r4, #1
+ load_coeffs_16b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: subs r3, #1
+ pld [r1]
+ vextin_d4
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ cmp r5, #4
+ bgt 8b
+4: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+2: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_v_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ load_coeffs_16b r7
+ sub r1, r2
+ lsl r4, #1
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vld1.8 {d16}, [r1], r2
+ pld [r1]
+ vld1.8 {d17}, [r1], r2
+ pld [r1]
+ vld1.8 {d18}, [r1], r2
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_hv_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r6, [sp, #16] // mx
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+ adr r12, epel_coeffs
+ sub r6, #1
+ lsl r6, #2
+ add r6, r12 // mx epel coeff offset
+ add r7, r12
+ sub r1, #1
+ sub r1, r2
+ lsl r4, #1
+ load_coeffs_16b r6
+ load_coeffs_32b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vextin_d4
+ epel_filter_16b q12
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q13
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q14
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vextin_d4
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b
+ vst1.16 {q3}, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.16 d6, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.32 d6[0], [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+epel_coeffs:
+ .byte 2, 58, 10, 2
+ .byte 4, 54, 16, 2
+ .byte 6, 46, 28, 4
+ .byte 4, 36, 36, 4
+ .byte 4, 28, 46, 6
+ .byte 2, 16, 54, 4
+ .byte 2, 10, 58, 2
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 5591807..49c70dd 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -22,6 +22,8 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/bit_depth_template.c"
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+
+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+
+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
ptrdiff_t srcstride, int height, \
@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
#undef PUT_PIXELS
+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, int width);
@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
}
+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int8_t offset_table[32] = { 0 };
+ int k, y, x;
+ int shift = 3; // BIT_DEPTH - 5
+ int cwidth = 0;
+
+ stride_src /= sizeof(pixel);
+ stride_dst /= sizeof(pixel);
+
+ for (k = 0; k < 4; k++)
+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+
+ if (height % 8 == 0)
+ cwidth = width;
+
+ switch(cwidth){
+ case 8:
+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ case 16:
+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ case 32:
+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ case 64:
+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ default:
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+ dst += stride_dst;
+ src += stride_src;
+ }
+ }
+}
+
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+ static const int8_t pos[4][2][2] = {
+ { { -1, 0 }, { 1, 0 } }, // horizontal
+ { { 0, -1 }, { 0, 1 } }, // vertical
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+ };
+ int8_t sao_offset_val[8]; // padding of 3 for vld
+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
+ int cwidth = 0;
+
+ for (x = 0; x < 5; x++) {
+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
+ }
+
+ if (height % 8 == 0)
+ cwidth = width;
+
+ stride_src /= sizeof(pixel);
+ stride_dst /= sizeof(pixel);
+
+ switch (cwidth) {
+ case 32:
+ switch(eo) {
+ case 0:
+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 1:
+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 2:
+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 3:
+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ }
+ break;
+ case 64:
+ switch(eo) {
+ case 0:
+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 1:
+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 2:
+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 3:
+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ }
+ break;
+ default:
+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ int diff0 = CMP(src[x], src[x + a_stride]);
+ int diff1 = CMP(src[x], src[x + b_stride]);
+ int idx = diff0 + diff1;
+ if (idx)
+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
+ }
+ src += stride_src;
+ dst += stride_dst;
+ }
+ }
+}
+#undef CMP
+
+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
+
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
if (bit_depth == 8) {
@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper;
+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper;
+ }
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_epel[x][1][0] = ff_hevc_put_epel_v_neon_8;
+ c->put_hevc_epel[x][0][1] = ff_hevc_put_epel_h_neon_8;
+ c->put_hevc_epel[x][1][1] = ff_hevc_put_epel_hv_neon_8;
}
+ c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
}
+
+ assert(offsetof(MvField, mv) == 0);
+ assert(offsetof(MvField, ref_idx) == 8);
+ assert(offsetof(MvField, pred_flag) == 10);
+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
}
diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
new file mode 100644
index 0000000..9c7808d
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_sao_neon.S
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro init_sao_band
+ pld [r1]
+ vld1.8 {q0, q1}, [r2] // offset table
+ ldr r2, [sp, #0] // stride_dst
+ ldr r12, [sp, #4] // height
+ vmov.u8 q3, #128
+.endm
+
+// 128 in q3
+// input q8 - q11
+.macro sao_band_64
+ vtbl.8 d24, {d0, d1, d2, d3}, d24
+ vadd.s8 q8, q3
+ vtbl.8 d25, {d0, d1, d2, d3}, d25
+ vadd.s8 q9, q3
+ vtbl.8 d26, {d0, d1, d2, d3}, d26
+ vadd.s8 q10, q3
+ vtbl.8 d27, {d0, d1, d2, d3}, d27
+ vadd.s8 q11, q3
+ vtbl.8 d28, {d0, d1, d2, d3}, d28
+ vqadd.s8 q8, q12
+ vtbl.8 d29, {d0, d1, d2, d3}, d29
+ vqadd.s8 q9, q13
+ vtbl.8 d30, {d0, d1, d2, d3}, d30
+ vqadd.s8 q10, q14
+ vtbl.8 d31, {d0, d1, d2, d3}, d31
+ vsub.s8 q8, q3
+ vqadd.s8 q11, q15
+ vsub.s8 q9, q3
+ vsub.s8 q10, q3
+ vsub.s8 q11, q3
+.endm
+
+function ff_hevc_sao_band_w8_neon_8, export=1
+ init_sao_band
+1: subs r12, #8
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+ vshr.u8 q12, q8, #3
+ vld1.8 {d18}, [r1, :64], r3
+ vld1.8 {d19}, [r1, :64], r3
+ vshr.u8 q13, q9, #3
+ vld1.8 {d20}, [r1, :64], r3
+ vld1.8 {d21}, [r1, :64], r3
+ vshr.u8 q14, q10, #3
+ vld1.8 {d22}, [r1, :64], r3
+ vld1.8 {d23}, [r1, :64], r3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ vst1.8 {d18}, [r0, :64], r2
+ vst1.8 {d19}, [r0, :64], r2
+ vst1.8 {d20}, [r0, :64], r2
+ vst1.8 {d21}, [r0, :64], r2
+ vst1.8 {d22}, [r0, :64], r2
+ vst1.8 {d23}, [r0, :64], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w16_neon_8, export=1
+ init_sao_band
+1: subs r12, #4
+ vld1.8 {q8}, [r1, :128], r3
+ vshr.u8 q12, q8, #3
+ vld1.8 {q9}, [r1, :128], r3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vld1.8 {q11}, [r1, :128], r3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {q8}, [r0, :128], r2
+ vst1.8 {q9}, [r0, :128], r2
+ vst1.8 {q10}, [r0, :128], r2
+ vst1.8 {q11}, [r0, :128], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w32_neon_8, export=1
+ init_sao_band
+1: subs r12, #2
+ vld1.8 {q8-q9}, [r1, :128], r3
+ vshr.u8 q12, q8, #3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {q8-q9}, [r0, :128], r2
+ vst1.8 {q10-q11}, [r0, :128], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w64_neon_8, export=1
+ init_sao_band
+1: subs r12, #1
+ pld [r1, r3]
+ vld1.8 {q8-q9}, [r1, :128]!
+ vshr.u8 q12, q8, #3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vshr.u8 q15, q11, #3
+ sub r1, #32
+ sao_band_64
+ vst1.8 {q8-q9}, [r0, :128]!
+ vst1.8 {q10-q11}, [r0, :128], r2
+ sub r0, #32
+ bne 1b
+
+ bx lr
+endfunc
+
+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0
+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0
+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2
+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2
+ vsub.s8 \out0, \tmp0, \out0 // diff0
+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+.endm
+
+.macro table64
+ vmov.s8 q13, #2 // 2 to all elements
+ vmov.32 d24[0], r4 // load offset table from general registers
+ vmov.32 d24[1], r5 // load rest of offset table
+
+ vadd.s8 q0, q13
+ vadd.s8 q1, q13
+ vadd.s8 q2, q13
+ vadd.s8 q3, q13
+
+ vmov.u8 q15, #128 // s8 #-128
+ vtbl.8 d0, {d24}, d0
+ vadd.s8 q13, q4, q15
+ vtbl.8 d1, {d24}, d1
+ vadd.s8 q14, q5, q15
+ vtbl.8 d2, {d24}, d2
+ vqadd.s8 q0, q13
+ vtbl.8 d3, {d24}, d3
+ vqadd.s8 q1, q14
+ vtbl.8 d4, {d24}, d4
+ vadd.s8 q13, q6, q15
+ vtbl.8 d5, {d24}, d5
+ vadd.s8 q14, q7, q15
+ vtbl.8 d6, {d24}, d6
+ vqadd.s8 q2, q13
+ vtbl.8 d7, {d24}, d7
+ vqadd.s8 q3, q14
+ vsub.s8 q0, q15
+ vsub.s8 q1, q15
+ vsub.s8 q2, q15
+ vsub.s8 q3, q15
+ vst1.8 {q0-q1}, [r0, :128]!
+ vst1.8 {q2-q3}, [r0, :128], r2
+ sub r0, #32
+.endm
+
+// input
+// a in q0 - q3
+// c in q4 - q7
+// b in q8 - q11
+// offset table in r7 and r5
+// output in q0 - q3
+// clobbers q12 - q15
+.macro edge_w64_body
+ diff32 q12, q13, q0, q1, q0, q1, q4, q5
+ diff32 q0, q1, q14, q15, q8, q9, q4, q5
+
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+
+ diff32 q14, q15, q2, q3, q2, q3, q6, q7
+ diff32 q2, q3, q12, q13, q10, q11, q6, q7
+
+ vadd.s8 q2, q14
+ vadd.s8 q3, q15
+ table64
+.endm
+
+.macro init_edge_64
+ push {r4-r5}
+ ldr r12, [sp, #8] // height
+ ldr r5, [sp, #12] // sao_offset_val_table
+ ldr r4, [r5]
+ add r5, #4
+ ldr r5, [r5]
+.endm
+
+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+ sub r1, #8
+1: subs r12, #1
+ vld1.64 {d7}, [r1, :64]!
+ vld1.64 {q4-q5}, [r1, :128]! // load c
+ vld1.64 {q6-q7}, [r1, :128]!
+ vld1.64 {d24}, [r1, :64], r3
+ sub r1, #72
+ // load a
+ vext.8 q0, q3, q4, #15
+ vext.8 q1, q4, q5, #15
+ vext.8 q2, q5, q6, #15
+ vext.8 q3, q6, q7, #15
+ // load b
+ vext.8 q8, q4, q5, #1
+ vext.8 q9, q5, q6, #1
+ vext.8 q10, q6, q7, #1
+ vext.8 q11, q7, q12, #1
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128]!
+ vld1.8 {q2-q3}, [r1, :128], r3
+ sub r1, #32
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128]!
+ vld1.8 {q10-q11}, [r1, :128], r3
+ sub r1, #32
+ edge_w64_body
+ // copy c to a
+ vmov.64 q0, q4
+ vmov.64 q1, q5
+ vmov.64 q2, q6
+ vmov.64 q3, q7
+ // copy b to c
+ vmov.64 q4, q8
+ vmov.64 q5, q9
+ vmov.64 q6, q10
+ vmov.64 q7, q11
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ sub r1, #1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ sub r1, #31
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+ // load b
+ add r1, #1
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #33
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ add r1, #1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ sub r1, #33
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+ // load b
+ sub r1, #1
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #31
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+.macro init_edge_32
+ ldr r12, [sp, #4] // sao_offset_val_table
+ vld1.32 {d31}, [r12]
+ ldr r12, [sp] // height
+.endm
+
+.macro diff out0, tmp0, in0, in1
+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0
+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0
+ vsub.s8 \out0, \tmp0, \out0 // diff0
+.endm
+
+.macro table32
+ vmov.s8 q10, #2
+ vadd.s8 q0, q10
+ vadd.s8 q1, q10
+ vmov.s8 q10, #128
+ vtbl.8 d0, {d31}, d0
+ vadd.s8 q11, q2, q10
+ vtbl.8 d1, {d31}, d1
+ vadd.s8 q12, q3, q10
+ vtbl.8 d2, {d31}, d2
+ vqadd.s8 q11, q0
+ vtbl.8 d3, {d31}, d3
+ vqadd.s8 q12, q1
+ vsub.s8 q0, q11, q10
+ vsub.s8 q1, q12, q10
+ vst1.8 {q0-q1}, [r0, :128], r2
+.endm
+
+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
+ init_edge_32
+ vpush {q4-q7}
+ sub r1, #4
+1: subs r12, #1
+ vld1.8 {q13-q14}, [r1]!
+ vld1.32 d30, [r1], r3
+ sub r1, #32
+ // a
+ vext.8 q0, q13, q14, #3
+ vext.8 q1, q14, q15, #3
+ vshr.u64 d24, d30, #24
+ // c
+ vext.8 q2, q13, q14, #4
+ vext.8 q3, q14, q15, #4
+ vshr.u64 d16, d30, #32
+ // diff0
+ diff32 q13, q14, q4, q5, q0, q1, q2, q3
+ diff d18, d25, d24, d16
+ // -diff1
+ vext.s8 q0, q13, q14, #1
+ vext.s8 q1, q14, q9, #1
+
+ vsub.s8 q0, q13, q0 //diff0 + diff1
+ vsub.s8 q1, q14, q1
+ table32
+ bne 1b
+ vpop {q4-q7}
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
+ init_edge_32
+ vpush {q4-q7}
+ // load a
+ sub r1, r3
+ vld1.8 {q0-q1}, [r1, :128], r3
+ // load c
+ vld1.8 {q2-q3}, [r1, :128], r3
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128], r3
+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
+ vadd.s8 q0, q4, q12 //diff0 + diff1
+ vadd.s8 q1, q5, q13
+ table32
+ // CMP ( c, a )
+ vneg.s8 q12, q4
+ vneg.s8 q13, q5
+ // c
+ vmov.64 q2, q8
+ vmov.64 q3, q9
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
+ init_edge_32
+ vpush {d8-d15}
+ // load a
+ sub r1, r3
+ sub r1, #8
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {d24}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q0, q10, q11, #7
+ vext.8 q1, q11, q12, #7
+ // load c
+ vld1.8 {d9}, [r1, :64]!
+ vld1.8 {q2-q3}, [r1, :64], r3
+ sub r1, #8
+ vext.8 q4, q4, q2, #15
+1: subs r12, #1
+ // load b
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {q12}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q8, q10, q11, #9
+ vext.8 q9, q11, q12, #9
+ vext.8 q6, q10, q11, #8
+ vext.8 q7, q11, q12, #8
+ vext.8 q5, q10, q11, #7
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3
+ diff32 q0, q1, q10, q11, q8, q9, q2, q3
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+ table32
+ // inputs for next loop iteration
+ // a
+ vmov.8 q0, q4
+ vext.8 q1, q2, q3, #15
+ // c
+ vmov.8 q2, q6
+ vmov.8 q3, q7
+ vmov.8 q4, q5
+ bne 1b
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
+ init_edge_32
+ sub r1, r3
+ // load a
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {d24}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q0, q10, q11, #1
+ vext.8 q1, q11, q12, #1
+ // load c
+ vld1.8 {q2-q3}, [r1, :64]!
+ vld1.8 {d30}, [r1, :64], r3
+ sub r1, #40
+1: subs r12, #1
+ // load b
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {q12}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q8, q10, q11, #7
+ vext.8 q9, q11, q12, #7
+ vext.8 q14, q12, q10, #7
+
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3
+ diff32 q0, q1, q10, q11, q8, q9, q2, q3
+
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+ table32
+
+ // inputs for next loop iteration
+ // a
+ vext.8 q0, q2, q3, #1
+ vext.8 q1, q3, q15, #1
+ // c
+ vext.8 q2, q8, q9, #1
+ vext.8 q3, q9, q14, #1
+ vext.8 d30, d28, d2, #1
+ bne 1b
+ bx lr
+endfunc
+
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 39713ed..25eb52b 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,8 @@ enum AVCodecID {
AV_CODEC_ID_SHEERVIDEO,
AV_CODEC_ID_YLC,
+ AV_CODEC_ID_H264_MVC,
+
/* various PCM "codecs" */
AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
#define FF_BUG_DC_CLIP 4096
#define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders.
#define FF_BUG_TRUNCATED 16384
+#define FF_BUG_GMC_UNSUPPORTED 32768
/**
* strictly follow the standard (MPEG-4, ...).
@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_CAVLC_444 44
+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118
+#define FF_PROFILE_H264_STEREO_HIGH 128
+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
#define FF_PROFILE_VC1_SIMPLE 0
#define FF_PROFILE_VC1_MAIN 1
@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
#endif
+ /**
+ * Opaque pointer for use by replacement get_buffer2 code
+ *
+ * @author jc (08/02/2016)
+ */
+ void * get_buffer_context;
} AVCodecContext;
AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 1bf1c62..ccfa991 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
typedef struct CABACContext{
int low;
int range;
- int outstanding_count;
+ union
+ {
+ int outstanding_count;
+ struct {
+ uint16_t bits;
+ uint16_t range;
+ } by22;
+ };
const uint8_t *bytestream_start;
const uint8_t *bytestream;
const uint8_t *bytestream_end;
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 9d94b72..535ebf0 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
+ {
+ .id = AV_CODEC_ID_H264_MVC,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "h264_mvc",
+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
/* various PCM "codecs" */
{
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index efe3555..16358aa 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -126,7 +126,9 @@ enum {
NAL_END_STREAM = 11,
NAL_FILLER_DATA = 12,
NAL_SPS_EXT = 13,
+ NAL_SPS_SUBSET = 15,
NAL_AUXILIARY_SLICE = 19,
+ NAL_SLICE_EXT = 20,
NAL_FF_IGNORE = 0xff0f001,
};
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index ce4bab2..b9b0c78 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
- uint8_t parse_history[6];
+ uint8_t parse_history[9];
int parse_history_count;
int parse_last_mb;
+ int is_mvc;
+ int slice_ext;
} H264ParseContext;
@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
} else if (state <= 5) {
int nalu_type = buf[i] & 0x1F;
if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
- nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+ nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+ nalu_type == NAL_SPS_SUBSET) {
if (pc->frame_start_found) {
i++;
goto found;
}
} else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
- nalu_type == NAL_IDR_SLICE) {
+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
state += 8;
+
+ p->slice_ext = (nalu_type == NAL_SLICE_EXT);
continue;
}
state = 7;
} else {
p->parse_history[p->parse_history_count++] = buf[i];
- if (p->parse_history_count > 5) {
+ if (p->parse_history_count > 8) {
unsigned int mb, last_mb = p->parse_last_mb;
GetBitContext gb;
- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
p->parse_history_count = 0;
mb= get_ue_golomb_long(&gb);
p->parse_last_mb = mb;
@@ -145,7 +150,7 @@ found:
pc->frame_start_found = 0;
if (p->is_avc)
return next_avc;
- return i - (state & 5) - 5 * (state > 7);
+ return i - (state & 5) - 8 * (state > 7);
}
static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
}
}
- parse_nal_units(s, avctx, buf, buf_size);
+ if (!p->is_mvc)
+ parse_nal_units(s, avctx, buf, buf_size);
if (avctx->framerate.num)
avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
if ((state & 0xFFFFFF00) != 0x100)
break;
nalu_type = state & 0x1F;
- if (nalu_type == NAL_SPS) {
+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
has_sps = 1;
} else if (nalu_type == NAL_PPS)
has_pps = 1;
@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
.parser_close = h264_close,
.split = h264_split,
};
+
+static av_cold int init_mvc(AVCodecParserContext *s)
+{
+ H264ParseContext *p = s->priv_data;
+ int ret = init(s);
+ if (ret < 0)
+ return ret;
+
+ p->is_mvc = 1;
+ return 0;
+}
+
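+// MVC variant of the H.264 parser: same frame-splitting logic, but parse_nal_units()
+// is skipped for MVC streams (see the is_mvc check in h264_parse above).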
+AVCodecParser ff_h264_mvc_parser = {
+ .codec_ids = { AV_CODEC_ID_H264_MVC },
+ .priv_data_size = sizeof(H264ParseContext),
+ .parser_init = init_mvc,
+ .parser_parse = h264_parse,
+ .parser_close = h264_close,
+ .split = h264_split,
+};
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index b478065..88dd40b 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -41,8 +41,186 @@
#include "hevc.h"
#include "profiles.h"
+#ifdef RPI
+ #include "rpi_qpu.h"
+ #include "rpi_user_vcsm.h"
+ // Move Inter prediction into separate pass
+ #define RPI_INTER
+
+ #ifdef RPI_INTER_QPU
+ // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+ #define RPI_MULTI_MAILBOX
+ #endif
+
+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+ // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+
+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+ //#define RPI_SIMULATE_QPUS
+ #ifdef RPI_WORKER
+ #include "pthread.h"
+ #endif
+
+ static void rpi_execute_dblk_cmds(HEVCContext *s);
+ static void rpi_execute_transform(HEVCContext *s);
+ static void rpi_launch_vpu_qpu(HEVCContext *s);
+ static void rpi_execute_pred_cmds(HEVCContext *s);
+ static void rpi_execute_inter_cmds(HEVCContext *s);
+ static void rpi_begin(HEVCContext *s);
+ static void flush_frame(HEVCContext *s,AVFrame *frame);
+ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+
+#endif
+
+// #define DISABLE_MC
+
+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+
+#ifndef av_mod_uintp2
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+ return a & ((1 << p) - 1);
+}
+# define av_mod_uintp2 av_mod_uintp2_c
+#endif
+
const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+
+#ifdef RPI_INTER_QPU
+
+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+// For each block of 64*64 the smallest block size is 8x4
+// We also need an extra command for the setup information
+
+#define RPI_CHROMA_COMMAND_WORDS 12
+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+// The QPU code for UV blocks only works up to a block width of 8
+#define RPI_CHROMA_BLOCK_WIDTH 8
+
+#define RPI_LUMA_COMMAND_WORDS 10
+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+
+// TODO Chroma only needs 4 taps
+
+// Actual filter goes -ve, +ve, +ve, -ve using these values
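+// These are the magnitudes of the standard HEVC 4-tap chroma interpolation filter,
+// indexed by the 1/8-pel fractional offset; the signs noted above are applied by the filter code.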
+static const uint32_t rpi_filter_coefs[8][1] = {
+ { ENCODE_COEFFS( 0, 64, 0, 0) },
+ { ENCODE_COEFFS( 2, 58, 10, 2) },
+ { ENCODE_COEFFS( 4, 54, 16, 2) },
+ { ENCODE_COEFFS( 6, 46, 28, 4) },
+ { ENCODE_COEFFS( 4, 36, 36, 4) },
+ { ENCODE_COEFFS( 4, 28, 46, 6) },
+ { ENCODE_COEFFS( 2, 16, 54, 4) },
+ { ENCODE_COEFFS( 2, 10, 58, 2) }
+};
+
+#endif
+
+
+#ifdef RPI_WORKER
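+// Two-pass decode model: the main decode thread (pass 0) parses the bitstream and
+// queues transform/prediction/deblock commands into the current job slot, while the
+// worker thread (pass 1) replays completed jobs. worker_head/worker_tail form a
+// simple counting queue of submitted vs completed jobs, protected by worker_mutex.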
+
+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+
+#define LOG_ENTER
+#define LOG_EXIT
+
+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+static void worker_submit_job(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ s->worker_tail++;
+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call this to say we have completed pass1
+static void worker_complete_job(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ s->worker_head++;
+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call this to wait for all jobs to have completed at the end of a frame
+static void worker_wait(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ while( s->worker_head !=s->worker_tail)
+ {
+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+// available to receive the next job.
+static void worker_pass0_ready(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ // tail is number of submitted jobs
+ // head is number of completed jobs
+ // tail-head is number of outstanding jobs in the queue
+ // we need to ensure there is at least 1 space left for us to use
+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+ {
+ // Wait until another job is completed
+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
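+// Worker thread entry point: block until a job has been submitted, run pass 1 for it
+// (QPU/VPU launch, inter, intra/residual, deblock), then mark the job complete.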
+static void *worker_start(void *arg)
+{
+ HEVCContext *s = (HEVCContext *)arg;
+ while(1) {
+ pthread_mutex_lock(&s->worker_mutex);
+
+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+ {
+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+
+ if (s->kill_worker) {
+ break;
+ }
+ LOG_ENTER
+ // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+ rpi_launch_vpu_qpu(s);
+ // Perform inter prediction
+ rpi_execute_inter_cmds(s);
+ // Wait for transform completion
+ vpu_wait(s->vpu_id);
+
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+
+ worker_complete_job(s);
+ LOG_EXIT
+ }
+ return NULL;
+}
+
+#endif
+
/**
* NOTE: Each function hls_foo correspond to the function foo in the
* specification (HLS stands for High Level Syntax).
@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
/* free everything allocated by pic_arrays_init() */
static void pic_arrays_free(HEVCContext *s)
{
+#ifdef RPI
+ int job;
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ if (s->coeffs_buf_arm[job][0]) {
+ gpu_free(&s->coeffs_buf_default[job]);
+ s->coeffs_buf_arm[job][0] = 0;
+ }
+ if (s->coeffs_buf_arm[job][2]) {
+ gpu_free(&s->coeffs_buf_accelerated[job]);
+ s->coeffs_buf_arm[job][2] = 0;
+ }
+ }
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+
+ if (dvq->vpu_cmds_arm) {
+ gpu_free(&dvq->deblock_vpu_gmem);
+ dvq->vpu_cmds_arm = 0;
+ }
+ }
+ }
+#endif
av_freep(&s->sao);
av_freep(&s->deblock);
@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
int ctb_count = sps->ctb_width * sps->ctb_height;
int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+#ifdef RPI
+ int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+ int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+ int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+ int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+ int job;
+
+ av_assert0(sps);
+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+ s->ctu_per_y_chan = s->max_ctu_count / 12;
+ s->ctu_per_uv_chan = s->max_ctu_count / 8;
+ printf("Allocated %d\n",coefs_per_row);
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+ if (!s->coeffs_buf_arm[job][0])
+ goto fail;
+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data
+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+ if (!s->coeffs_buf_arm[job][2])
+ goto fail;
+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards.
+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+ }
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ s->enable_rpi_deblock = !sps->sao_enabled;
+ s->setup_width = (sps->width+15) / 16;
+ s->setup_height = (sps->height+15) / 16;
+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+ {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+ const unsigned int total_size = cmd_size + y_size + uv_size;
+ int p_vc;
+ uint8_t * p_arm;
+ #if RPI_VPU_DEBLOCK_CACHED
+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+ #else
+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+ #endif
+ p_vc = dvq->deblock_vpu_gmem.vc;
+ p_arm = dvq->deblock_vpu_gmem.arm;
+
+ // Zap all
+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+
+ // Subdivide
+ dvq->vpu_cmds_arm = (void*)p_arm;
+ dvq->vpu_cmds_vc = p_vc;
+
+ p_arm += cmd_size;
+ p_vc += cmd_size;
+
+ dvq->y_setup_arm = (void*)p_arm;
+ dvq->y_setup_vc = (void*)p_vc;
+
+ p_arm += y_size;
+ p_vc += y_size;
+
+ dvq->uv_setup_arm = (void*)p_arm;
+ dvq->uv_setup_vc = (void*)p_vc;
+
+ dvq->cmd_id = -1;
+ }
+
+ s->dvq_n = 0;
+ s->dvq = s->dvq_ents + s->dvq_n;
+ }
+#endif
+
s->bs_width = (width >> 2) + 1;
s->bs_height = (height >> 2) + 1;
@@ -137,6 +422,29 @@ fail:
return AVERROR(ENOMEM);
}
+static void default_pred_weight_table(HEVCContext * const s)
+{
+ unsigned int i;
+ s->sh.luma_log2_weight_denom = 0;
+ s->sh.chroma_log2_weight_denom = 0;
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+ s->sh.luma_weight_l0[i] = 1;
+ s->sh.luma_offset_l0[i] = 0;
+ s->sh.chroma_weight_l0[i][0] = 1;
+ s->sh.chroma_offset_l0[i][0] = 0;
+ s->sh.chroma_weight_l0[i][1] = 1;
+ s->sh.chroma_offset_l0[i][1] = 0;
+ }
+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+ s->sh.luma_weight_l1[i] = 1;
+ s->sh.luma_offset_l1[i] = 0;
+ s->sh.chroma_weight_l1[i][0] = 1;
+ s->sh.chroma_offset_l1[i][0] = 0;
+ s->sh.chroma_weight_l1[i][1] = 1;
+ s->sh.chroma_offset_l1[i][1] = 0;
+ }
+}
+
static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
{
int i = 0;
@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
(s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
pred_weight_table(s, gb);
}
+ else
+ {
+ // Give us unit weights
+ default_pred_weight_table(s);
+ }
sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
return 0;
}
+#ifdef RPI
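+// With the RPI pass split enabled, intra prediction is not performed inline; instead an
+// RPI_PRED_INTRA command is queued here and replayed later by rpi_execute_pred_cmds().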
+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+{
+ if (s->enable_rpi) {
+ HEVCLocalContext *lc = s->HEVClc;
+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ cmd->type = RPI_PRED_INTRA;
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->x = x0;
+ cmd->y = y0;
+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+ cmd->mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+ } else {
+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+}
+#endif
+
static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int xBase, int yBase, int cb_xBase, int cb_yBase,
int log2_cb_size, int log2_trafo_size,
@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
int trafo_size = 1 << log2_trafo_size;
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+#endif
}
if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+#endif
}
} else if (blk_idx == 3) {
int trafo_size_h = 1 << (log2_trafo_size + 1);
int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, xBase, yBase,
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#endif
}
}
}
@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
* @param luma_offset additive offset applied to the luma prediction value
*/
+#ifdef RPI_INTER
+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
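+// When the RPI path is enabled, RPI_REDIRECT() routes motion compensation through the
+// rpi_*_mc_* wrappers below, which only queue an HEVCMvCmd for the current job; the
+// actual filtering is performed in pass 1 by rpi_execute_inter_cmds().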
+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ AVFrame *ref, const Mv *mv, int x_off, int y_off,
+ int block_w, int block_h, int luma_weight, int luma_offset)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_LUMA_UNI;
+ cmd->dst = dst;
+ cmd->dststride = dststride;
+ cmd->src = ref->data[0];
+ cmd->srcstride = ref->linesize[0];
+ cmd->mv = *mv;
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->weight = luma_weight;
+ cmd->offset = luma_offset;
+}
+
+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_LUMA_BI;
+ cmd->dst = dst;
+ cmd->dststride = dststride;
+ cmd->src = ref0->data[0];
+ cmd->srcstride = ref0->linesize[0];
+ cmd->mv = *mv0;
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->src1 = ref1->data[0];
+ cmd->srcstride1 = ref1->linesize[0];
+ cmd->mv1 = *mv1;
+ cmd->ref_idx[0] = current_mv->ref_idx[0];
+ cmd->ref_idx[1] = current_mv->ref_idx[1];
+}
+
+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_CHROMA_UNI;
+ cmd->dst = dst0;
+ cmd->dststride = dststride;
+ cmd->src = src0;
+ cmd->srcstride = srcstride;
+ cmd->mv = current_mv->mv[reflist];
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->weight = chroma_weight;
+ cmd->offset = chroma_offset;
+}
+
+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+ cmd->dst = dst0;
+ cmd->dststride = dststride;
+ cmd->src = ref0->data[cidx+1];
+ cmd->srcstride = ref0->linesize[cidx+1];
+ cmd->mv = current_mv->mv[0];
+ cmd->mv1 = current_mv->mv[1];
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->src1 = ref1->data[cidx+1];
+ cmd->srcstride1 = ref1->linesize[cidx+1];
+ cmd->ref_idx[0] = current_mv->ref_idx[0];
+ cmd->ref_idx[1] = current_mv->ref_idx[1];
+}
+
+#else
+#define RPI_REDIRECT(fn) fn
+#endif
+
static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref, const Mv *mv, int x_off, int y_off,
int block_w, int block_h, int luma_weight, int luma_offset)
@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
int idx = ff_hevc_pel_weight[block_w];
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> 2;
y_off += mv->y >> 2;
src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
* @param mv1 motion vector1 (relative to block position) to get pixel data from
* @param current_mv current motion vector structure
*/
- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
{
@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
intptr_t _mx = mx << (1 - hshift);
intptr_t _my = my << (1 - vshift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> (2 + hshift);
y_off += mv->y >> (2 + vshift);
src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
int hshift = s->ps.sps->hshift[1];
int vshift = s->ps.sps->vshift[1];
+#ifdef DISABLE_MC
+ return;
+#endif
+
intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
}
}
-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH,
- int log2_cb_size, int partIdx, int idx)
+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
{
#define POS(c_idx, x, y) \
&s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
(((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
- HEVCLocalContext *lc = s->HEVClc;
+ HEVCLocalContext * const lc = s->HEVClc;
int merge_idx = 0;
struct MvField current_mv = {{{ 0 }}};
@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int y_cb = y0 >> log2_min_cb_size;
int x_pu, y_pu;
int i, j;
-
- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
if (!skip_flag)
lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi) {
+ const Mv * const mv = &current_mv.mv[0];
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my<<8) | mx;
+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+ const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+ uint32_t * y = s->curr_y_mvs;
+
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+
+ for(int start_x=0;start_x < nPbW;start_x+=16) {
+ const int bw = nPbW-start_x;
+ const int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+ *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+ {
+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l0[current_mv.ref_idx[0]],
s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ }
if (s->ps.sps->chroma_format_idc) {
- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ const Mv *mv = &current_mv.mv[0];
+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ intptr_t _mx = mx << (1 - hshift);
+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+
+ int x1_c = x0_c + (mv->x >> (2 + hshift));
+ int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ uint32_t *u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ int bw = nPbW_c-start_x;
+ int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
}
@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi) {
+ const int reflist = 1;
+ const Mv *mv = &current_mv.mv[reflist];
+ int mx = mv->x & 3;
+ int my = mv->y & 3;
+ int my_mx = (my<<8) + mx;
+ int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+ int x1 = x0 + (mv->x >> 2);
+ int y1 = y0 + (mv->y >> 2);
+ uint32_t *y = s->curr_y_mvs;
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ for(int start_x=0;start_x < nPbW;start_x+=16) {
+ int bw = nPbW-start_x;
+ int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+ *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+ *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+
+ {
+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l1[current_mv.ref_idx[1]],
s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ }
if (s->ps.sps->chroma_format_idc) {
- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ const int reflist = 1;
+ const int hshift = s->ps.sps->hshift[1];
+ const int vshift = s->ps.sps->vshift[1];
+ const Mv * const mv = &current_mv.mv[reflist];
+ const intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ const intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ const intptr_t _mx = mx << (1 - hshift);
+ const intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+
+ const int x1_c = x0_c + (mv->x >> (2 + hshift));
+ const int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ uint32_t * u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ const int bw = nPbW_c-start_x;
+ const int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
}
@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+#ifdef RPI_LUMA_QPU
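+ // Note the "&& 0": the QPU path for bi-directional luma is disabled here, so the
+ // RPI_REDIRECT(luma_mc_bi) route below is always taken for B blocks.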
+ if (s->enable_rpi && 0) {
+ const Mv *mv = &current_mv.mv[0];
+ int mx = mv->x & 3;
+ int my = mv->y & 3;
+ int my_mx = (my<<8) + mx;
+ const Mv *mv2 = &current_mv.mv[1];
+ int mx2 = mv2->x & 3;
+ int my2 = mv2->y & 3;
+ int my2_mx2 = (my2<<8) + mx2;
+ int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+ int x1 = x0 + (mv->x >> 2);
+ int y1 = y0 + (mv->y >> 2);
+ int x2 = x0 + (mv2->x >> 2);
+ int y2 = y0 + (mv2->y >> 2);
+ uint32_t *y = s->curr_y_mvs;
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+ int bw = nPbW-start_x;
+ int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+
+ *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+ s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+ s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+ {
+ RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
ref1->frame, &current_mv.mv[1], &current_mv);
+ }
if (s->ps.sps->chroma_format_idc) {
- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ const Mv *mv = &current_mv.mv[0];
+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ intptr_t _mx = mx << (1 - hshift);
+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+ int x1_c = x0_c + (mv->x >> (2 + hshift));
+ int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ const Mv *mv2 = &current_mv.mv[1];
+ intptr_t mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+ intptr_t my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+ intptr_t _mx2 = mx2 << (1 - hshift);
+ intptr_t _my2 = my2 << (1 - vshift); // Fractional part of motion vector
+
+ int x2_c = x0_c + (mv2->x >> (2 + hshift));
+ int y2_c = y0_c + (mv2->y >> (2 + vshift));
+
+
+ uint32_t *u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ int bw = nPbW_c-start_x;
+ int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+ *u++ = 0; // Intermediate results are not written back in first pass of B filtering
+ *u++ = 0;
+
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx2][0];
+ *u++ = rpi_filter_coefs[_my2][0];
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+ RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
}
}
@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
}
+#ifdef RPI
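+// Pass 1: run the in-loop filters (deblock/SAO) for the CTBs queued by this job.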
+static void rpi_execute_dblk_cmds(HEVCContext *s)
+{
+ int n;
+ int job = s->pass1_job;
+ int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ int (*p)[2] = s->dblk_cmds[job];
+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+ }
+ s->num_dblk_cmds[job] = 0;
+}
+
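+// Pass 1: flush the cached coefficient buffer and post the queued 16x16 and 32x32
+// inverse transforms to the VPU; completion is waited for later via vpu_wait(s->vpu_id).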
+static void rpi_execute_transform(HEVCContext *s)
+{
+ int i=2;
+ int job = s->pass1_job;
+ /*int j;
+ int16_t *coeffs = s->coeffs_buf_arm[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+ s->hevcdsp.idct[4-2](coeffs, 16);
+ }
+ i=3;
+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+ s->hevcdsp.idct[5-2](coeffs, 32);
+ }*/
+
+ gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+ //gpu_cache_flush(&s->coeffs_buf_accelerated);
+ //vpu_wait(s->vpu_id);
+
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+}
+
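+// Pass 1: replay the queued prediction commands - intra prediction for RPI_PRED_INTRA
+// entries, otherwise transform_add of the reconstructed residual.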
+static void rpi_execute_pred_cmds(HEVCContext *s)
+{
+ int i;
+ int job = s->pass1_job;
+ HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+#else
+ HEVCLocalContext *lc = s->HEVClc;
+#endif
+
+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+ if (cmd->type == RPI_PRED_INTRA) {
+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
+ lc->na.cand_left = (cmd->na >> 3) & 1;
+ lc->na.cand_up_left = (cmd->na >> 2) & 1;
+ lc->na.cand_up = (cmd->na >> 1) & 1;
+ lc->na.cand_up_right = (cmd->na >> 0) & 1;
+ s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+ } else {
+#ifdef RPI_PRECLEAR
+ int trafo_size = 1 << cmd->size;
+#endif
+ s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+#ifdef RPI_PRECLEAR
+ memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+#endif
+ }
+ }
+ s->num_pred_cmds[job] = 0;
+}
+
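+// Pass 1 software fallback for queued inter prediction: rebuild minimal AVFrame/MvField
+// views from each HEVCMvCmd and call the normal motion compensation functions.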
+static void rpi_execute_inter_cmds(HEVCContext *s)
+{
+ int job = s->pass1_job;
+ HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+ int n,cidx;
+ AVFrame myref;
+ AVFrame myref1;
+ struct MvField mymv;
+ if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+ printf("Overflow inter_cmds\n");
+ exit(-1);
+ }
+ for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+ switch(cmd->cmd) {
+ case RPI_CMD_LUMA_UNI:
+ myref.data[0] = cmd->src;
+ myref.linesize[0] = cmd->srcstride;
+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+ break;
+ case RPI_CMD_LUMA_BI:
+ myref.data[0] = cmd->src;
+ myref.linesize[0] = cmd->srcstride;
+ myref1.data[0] = cmd->src1;
+ myref1.linesize[0] = cmd->srcstride1;
+ mymv.ref_idx[0] = cmd->ref_idx[0];
+ mymv.ref_idx[1] = cmd->ref_idx[1];
+ luma_mc_bi(s, cmd->dst, cmd->dststride,
+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+ &myref1, &cmd->mv1, &mymv);
+ break;
+ case RPI_CMD_CHROMA_UNI:
+ mymv.mv[0] = cmd->mv;
+ chroma_mc_uni(s, cmd->dst,
+ cmd->dststride, cmd->src, cmd->srcstride, 0,
+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
+ break;
+ case RPI_CMD_CHROMA_BI:
+ case RPI_CMD_CHROMA_BI+1:
+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
+ myref.data[cidx+1] = cmd->src;
+ myref.linesize[cidx+1] = cmd->srcstride;
+ myref1.data[cidx+1] = cmd->src1;
+ myref1.linesize[cidx+1] = cmd->srcstride1;
+ mymv.ref_idx[0] = cmd->ref_idx[0];
+ mymv.ref_idx[1] = cmd->ref_idx[1];
+ mymv.mv[0] = cmd->mv;
+ mymv.mv[1] = cmd->mv1;
+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
+ break;
+ }
+ }
+ s->num_mv_cmds[job] = 0;
+}
+
+static void rpi_do_all_passes(HEVCContext *s)
+{
+ // Kick off QPUs and VPUs
+ rpi_launch_vpu_qpu(s);
+ // Perform luma inter prediction
+ rpi_execute_inter_cmds(s);
+ // Wait for transform completion
+ vpu_wait(s->vpu_id);
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+ // Prepare next batch
+ rpi_begin(s);
+}
+
+#endif
+
+#ifdef RPI
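+// Pass 0 setup for a new job: rewrite the per-QPU setup headers for the chroma and luma
+// uniform streams and reset the CTU count before commands are appended.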
+static void rpi_begin(HEVCContext *s)
+{
+ int job = s->pass0_job;
+ int i;
+#ifdef RPI_INTER_QPU
+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1];
+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1];
+
+ for(i=0;i<8;i++) {
+ s->u_mvs[job][i] = s->mvs_base[job][i];
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = pic_width;
+ *s->u_mvs[job][i]++ = pic_height;
+ *s->u_mvs[job][i]++ = s->frame->linesize[1];
+ *s->u_mvs[job][i]++ = s->frame->linesize[2];
+ *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = i; // Select section of VPM (avoid collisions with 3d unit)
+ }
+ s->curr_u_mvs = s->u_mvs[job][0];
+#endif
+
+#ifdef RPI_LUMA_QPU
+ for(i=0;i<12;i++) {
+ // This needs to have a generally similar structure to the
+ // actual filter code as various pipelined bits need to land correctly
+ // when inserted by the filter requests
+ s->y_mvs[job][i] = s->y_mvs_base[job][i];
+ *s->y_mvs[job][i]++ = 0; // y_x
+ *s->y_mvs[job][i]++ = 0; // ref_y_base
+ *s->y_mvs[job][i]++ = 0; // y2_x2
+ *s->y_mvs[job][i]++ = 0; // ref_y2_base
+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+ *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight denom + 6
+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+ *s->y_mvs[job][i]++ = 0; // Next kernel
+ }
+ s->curr_y_mvs = s->y_mvs[job][0];
+#endif
+ s->ctu_count = 0;
+}
+#endif
+
+#ifdef RPI_SIMULATE_QPUS
+
+static int32_t clipx(int x,int FRAME_WIDTH)
+{
+ if (x<=0) return 0;
+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
+ return x;
+}
+
+static int32_t clipy(int y,int FRAME_HEIGHT)
+{
+ if (y<=0) return 0;
+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
+ return y;
+}
+
+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+{
+ int32_t vsum = 0;
+ int x, y;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+
+ vsum += lumaFilter[my][y]*hsum;
+ }
+ vsum >>= 6;
+ vsum = (((vsum*weight)+round)>>denom)+offset;
+
+ return av_clip_uint8( vsum );
+}*/
+
+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+{
+ int32_t vsum = 0;
+ int x, y;
+ int chromaFilterH[4];
+ int chromaFilterV[4];
+ int i;
+ int offset_after = offset_weight>>16;
+ int weight = (offset_weight<<16)>>16;
+ for(i=0;i<4;i++) {
+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+ }
+
+ for (y = 0; y < 4; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 4; x++)
+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+
+ vsum += chromaFilterV[y]*hsum;
+ }
+ vsum >>= 6;
+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+
+ return vsum;
+}
+
+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+
+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+{
+ int32_t vsum = 0;
+ int x, y;
+ int i;
+ int offset_after = offset_weight>>16;
+ int weight = (offset_weight<<16)>>16;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+
+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+ }
+ vsum >>= 6;
+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+
+ return vsum;
+}
+
+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+{
+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+ int pitch = frame->linesize[cIdx];
+ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+ if (p>=base && p<base+pitch*pic_height) {
+ return frame->data[cIdx] + (p-base);
+ }
+ return NULL;
+}
+
+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+{
+ SliceHeader *sh = &s->sh;
+ uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+ int i;
+ if (arm) return arm;
+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+ {
+ for(i=0;i<sh->nb_refs[L0];i++) {
+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+ if (arm) return arm;
+ }
+ }
+ if (sh->slice_type == B_SLICE) {
+ for(i=0;i<sh->nb_refs[L1];i++) {
+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+ if (arm) return arm;
+ }
+ }
+ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+ exit(-1);
+ return NULL;
+}
+
+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+{
+ uint32_t next_kernel;
+ uint32_t x0;
+ uint32_t y0;
+ uint8_t *ref_u_base;
+ uint8_t *ref_v_base;
+ uint32_t frame_width = p[5];
+ uint32_t frame_height = p[6];
+ uint32_t pitch = p[7];
+ uint32_t dst_pitch = p[8];
+ int32_t offset_before = p[9];
+ int32_t denom = p[10];
+ uint32_t vpm_id = p[11];
+ uint32_t tmp_u_dst[256];
+ uint32_t tmp_v_dst[256];
+ while(1) {
+ p += 12;
+ next_kernel = p[0-12];
+ x0 = p[1-12];
+ y0 = p[2-12];
+ if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+ int x,y;
+ uint32_t width_height = p[5];
+ uint32_t hcoeffs = p[6];
+ uint32_t vcoeffs = p[7];
+ uint32_t offset_weight_u = p[8];
+ uint32_t offset_weight_v = p[9];
+ uint8_t *this_u_dst;
+ uint8_t *this_v_dst;
+ uint32_t width = width_height >> 16;
+ uint32_t height = (width_height << 16) >> 16;
+ ref_u_base = compute_arm_addr(s,p[3-12],1);
+ ref_v_base = compute_arm_addr(s,p[4-12],2);
+ if (next_kernel!=s->mc_filter_uv_b0)
+ {
+ this_u_dst = compute_arm_addr(s,p[10],1);
+ this_v_dst = compute_arm_addr(s,p[11],2);
+ }
+ for (y=0; y<height; ++y) {
+ for (x=0; x<width; ++x) {
+ if (next_kernel==s->mc_filter_uv) {
+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+ } else if (next_kernel==s->mc_filter_uv_b0) {
+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+ tmp_u_dst[x+y*16] = refa;
+ tmp_v_dst[x+y*16] = refb;
+ } else {
+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+ }
+ }
+ }
+ } else {
+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+ break;
+ }
+ }
+}
+
+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+{
+ uint32_t next_kernel;
+ int y_x,y2_x2;
+ int x0;
+ int y0;
+ int x2;
+ int y2;
+ uint32_t *p0 = p;
+ uint8_t *ref_y_base;
+ uint8_t *ref_y2_base;
+ uint32_t frame_width_height = p[4];
+ uint32_t frame_width = frame_width_height>>16;
+ uint32_t frame_height = (frame_width_height<<16)>>16;
+ uint32_t pitch = p[5];
+ uint32_t dst_pitch = p[6];
+ int offset_shift = p[7];
+ int32_t offset_before = offset_shift>>16;
+ int32_t denom = (offset_shift<<16)>>16;
+ while(1) {
+ p += 9;
+ next_kernel = p[8-9];
+ y_x = p[0-9];
+ x0 = (y_x<<16)>>16;
+ y0 = y_x>>16;
+ y2_x2 = p[2-9];
+ x2 = (y2_x2<<16)>>16;
+ y2 = y2_x2>>16;
+
+ if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+ int x,y;
+ uint32_t width_height = p[4];
+ uint32_t my2_mx2_my_mx = p[5];
+ uint32_t offset_weight = p[6];
+ uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+ uint32_t width = width_height >> 16;
+ uint32_t height = (width_height << 16) >> 16;
+ uint8_t *dst_base = s->frame->data[0];
+ ref_y_base = compute_arm_addr(s,p[1-9],0);
+ ref_y2_base = compute_arm_addr(s,p[3-9],0);
+ for (y=0; y<height; ++y) {
+ for (x=0; x<width; ++x) {
+ if (next_kernel==s->mc_filter) {
+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+ refa = av_clip_uint8(refa);
+ this_dst[x+y*dst_pitch] = refa;
+ }
+ else {
+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+ }
+ }
+ }
+ } else {
+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+ break;
+ }
+ }
+}
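+
+// Note on the uniform layout walked above: each 9-word group follows the
+// per-command comment in the loop (y_x, frame_base, y2_x2, frame_base2,
+// width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel).
+// The p[n-9] accesses pick the coordinates, reference bases and next_kernel
+// up from the group just stepped past, while the width/coefficients/dst come
+// from the current group - this mirrors the way the real QPU code issues its
+// texture requests one command ahead (see the dummy-location comments in
+// rpi_launch_vpu_qpu below).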
+
+static void rpi_simulate_inter_qpu(HEVCContext *s)
+{
+ // First run the transform as normal
+ int i;
+ rpi_execute_transform(s);
+ for(i=0;i<8;i++)
+ {
+ rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+ }
+ for(i=0;i<12;i++)
+ {
+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+ }
+}
+
+#endif
+
+#ifdef RPI_INTER_QPU
+
+static void rpi_launch_vpu_qpu(HEVCContext *s)
+{
+ int k;
+ int job = s->pass1_job;
+ int i;
+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+#ifdef RPI_LUMA_QPU
+ uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+#endif
+ if (s->sh.slice_type == I_SLICE) {
+#ifdef RPI_MULTI_MAILBOX
+ rpi_execute_transform(s);
+ return;
+#endif
+ }
+ for(k=0;k<8;k++) {
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+ av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+ }
+
+ s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+
+#ifdef RPI_LUMA_QPU
+ for(k=0;k<12;k++) {
+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+ s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
+ av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+ }
+ s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+#endif
+
+#ifdef RPI_SIMULATE_QPUS
+ rpi_simulate_inter_qpu(s);
+ return;
+#endif
+
+#ifdef RPI_MULTI_MAILBOX
+#ifdef RPI_CACHE_UNIF_MVS
+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+#else
+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+#endif
+
+#if 1
+ {
+ unsigned int i;
+ uint32_t * p;
+ uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
+
+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
+ *p++ = code;
+ }
+
+ code = qpu_get_fn(QPU_MC_SETUP);
+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
+ *p++ = code;
+ }
+
+ s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
+ vpu_get_constants(),
+ s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8,
+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10,
+ 0,
+ // QPU job 1
+ QPU_N_UV,
+ mail_uv,
+ // QPU job 2
+ QPU_N_Y,
+ mail_y
+ );
+ }
+
+#else
+ s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+ qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+#ifdef RPI_LUMA_QPU
+ qpu_get_fn(QPU_MC_SETUP),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+#else
+ 0,
+ 0,0,0,0,
+ 0,0,0,0,
+ 0,0,0,0
+#endif
+ );
+#endif
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+#else
+#error Code rotted here
+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+ );
+#endif
+
+
+}
+#else
+
+#ifdef RPI
+static void rpi_launch_vpu_qpu(HEVCContext *s)
+{
+ rpi_execute_transform(s);
+}
+#endif
+
+#endif
+
+#ifdef RPI
+
+#ifndef RPI_FAST_CACHEFLUSH
+#error RPI_FAST_CACHEFLUSH is broken
+static void flush_buffer(AVBufferRef *bref) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+ gpu_cache_flush(p);
+}
+#endif
+
+static void flush_frame(HEVCContext *s,AVFrame *frame)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+ int n = s->ps.sps->height;
+ int curr_y = 0;
+ int curr_uv = 0;
+ int n_uv = n >> s->ps.sps->vshift[1];
+ int sz,base;
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)(p.arm) + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)(p.arm) + base;
+ iocache.s[1].size = sz;
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)(p.arm) + base;
+ iocache.s[2].size = sz;
+ vcsm_clean_invalid( &iocache );
+#else
+ flush_buffer(frame->buf[0]);
+ flush_buffer(frame->buf[1]);
+ flush_buffer(frame->buf[2]);
+#endif
+}
+
+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ int n;
+ int curr_y;
+ int curr_uv;
+ int n_uv;
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+ int sz,base;
+ int (*d)[2] = s->dblk_cmds[job];
+ int low=(*d)[1];
+ int high=(*d)[1];
+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+ int y = (*d)[1];
+ low=FFMIN(low,y);
+ high=FFMAX(high,y);
+ }
+ curr_y = low;
+ n = high+(1 << s->ps.sps->log2_ctb_size);
+ curr_uv = curr_y >> s->ps.sps->vshift[1];
+ n_uv = n >> s->ps.sps->vshift[1];
+
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)(p.arm) + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)(p.arm) + base;
+ iocache.s[1].size = sz;
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)(p.arm) + base;
+ iocache.s[2].size = sz;
+
+ iocache.s[3].handle = p0->vcsm_handle;
+ iocache.s[3].cmd = 3; // clean+invalidate
+ iocache.s[3].addr = (int) p0->arm;
+ iocache.s[3].size = p0->numbytes;
+ if (p1) {
+ iocache.s[4].handle = p1->vcsm_handle;
+ iocache.s[4].cmd = 3; // clean+invalidate
+ iocache.s[4].addr = (int) p1->arm;
+ iocache.s[4].size = p1->numbytes;
+ }
+ if (p2) {
+ iocache.s[5].handle = p2->vcsm_handle;
+ iocache.s[5].cmd = 3; // clean+invalidate
+ iocache.s[5].addr = (int) p2->arm;
+ iocache.s[5].size = p2->numbytes;
+ }
+ vcsm_clean_invalid( &iocache );
+#else
+ flush_buffer(frame->buf[0]);
+ flush_buffer(frame->buf[1]);
+ flush_buffer(frame->buf[2]);
+ gpu_cache_flush3(p0, p1, p2);
+#endif
+}
+
+#endif
+
static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
{
HEVCContext *s = avctxt->priv_data;
@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
int y_ctb = 0;
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+#ifdef RPI
+ s->enable_rpi = s->ps.sps->bit_depth == 8
+ && !s->ps.pps->cross_component_prediction_enabled_flag;
+
+ if (!s->enable_rpi) {
+ if (s->ps.pps->cross_component_prediction_enabled_flag)
+ printf("Cross component\n");
+ }
+#endif
+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L0],s->sh.nb_refs[L1]);
+
if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
return AVERROR_INVALIDDATA;
@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
}
}
+#ifdef RPI_WORKER
+ s->pass0_job = 0;
+ s->pass1_job = 0;
+#endif
+#ifdef RPI
+ rpi_begin(s);
+#endif
+
while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
+#ifdef RPI_INTER_QPU
+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+#endif
+#ifdef RPI_LUMA_QPU
+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+#endif
+
more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+#ifdef RPI_INTER_QPU
+ s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+#endif
+#ifdef RPI_LUMA_QPU
+ s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+#endif
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+ //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+ //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+ //av_assert0(s->pass0_job>=0);
+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+ s->ctu_count++;
+ //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+
+ if ( s->ctu_count >= s->max_ctu_count ) {
+#ifdef RPI_WORKER
+ if (s->used_for_ref) {
+ // Split work load onto separate threads so we make as rapid progress as possible with this frame
+ // Pass on this job to worker thread
+ worker_submit_job(s);
+ // Make sure we have space to prepare the next job
+ worker_pass0_ready(s);
+
+ // Prepare the next batch of commands
+ rpi_begin(s);
+ } else {
+ // Non-ref frame so do it all on this thread
+ rpi_do_all_passes(s);
+ }
+#else
+ rpi_do_all_passes(s);
+#endif
+ }
+
+ }
+#endif
+
+
if (more_data < 0) {
s->tab_slice_address[ctb_addr_rs] = -1;
return more_data;
@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
ctb_addr_ts++;
ff_hevc_save_states(s, ctb_addr_ts);
+#ifdef RPI
+ if (s->enable_rpi)
+ continue;
+#endif
ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
}
+#ifdef RPI
+
+#ifdef RPI_WORKER
+ // Wait for the worker to finish all its jobs
+ if (s->enable_rpi) {
+ worker_wait(s);
+ }
+#endif
+
+ // Finish off any half-completed rows
+ if (s->enable_rpi && s->ctu_count) {
+ rpi_do_all_passes(s);
+ }
+
+#endif
+
if (x_ctb + ctb_size >= s->ps.sps->width &&
y_ctb + ctb_size >= s->ps.sps->height)
ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
s = s1->sList[self_id];
lc = s->HEVClc;
+#ifdef RPI
+ s->enable_rpi = 0;
+ //printf("Wavefront\n");
+#endif
+
if(ctb_row) {
ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
if (ret < 0)
return ret;
+ s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+ s->nal_unit_type == NAL_TSA_N ||
+ s->nal_unit_type == NAL_STSA_N ||
+ s->nal_unit_type == NAL_RADL_N ||
+ s->nal_unit_type == NAL_RASL_N);
+
+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+ s->is_decoded = 0;
+ break;
+ }
if (s->max_ra == INT_MAX) {
if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
s->max_ra = s->poc;
@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
}
fail:
- if (s->ref && s->threads_type == FF_THREAD_FRAME)
+ if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+#endif
ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-
+ } else if (s->ref) {
+#ifdef RPI_INTER_QPU
+ // When running single threaded we need to flush the whole frame
+ flush_frame(s,s->frame);
+#endif
+ }
return ret;
}
@@ -3064,6 +4625,41 @@ fail:
return AVERROR(ENOMEM);
}
+#ifdef RPI_WORKER
+static av_cold void hevc_init_worker(HEVCContext *s)
+{
+ int err;
+ pthread_cond_init(&s->worker_cond_head, NULL);
+ pthread_cond_init(&s->worker_cond_tail, NULL);
+ pthread_mutex_init(&s->worker_mutex, NULL);
+
+ s->worker_tail=0;
+ s->worker_head=0;
+ s->kill_worker=0;
+ err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+ if (err) {
+ printf("Failed to create worker thread\n");
+ exit(-1);
+ }
+}
+
+static av_cold void hevc_exit_worker(HEVCContext *s)
+{
+ void *res;
+ s->kill_worker=1;
+ pthread_cond_broadcast(&s->worker_cond_tail);
+ pthread_join(s->worker_thread, &res);
+
+ pthread_cond_destroy(&s->worker_cond_head);
+ pthread_cond_destroy(&s->worker_cond_tail);
+ pthread_mutex_destroy(&s->worker_mutex);
+
+ s->worker_tail=0;
+ s->worker_head=0;
+ s->kill_worker=0;
+}
+#endif
+
static av_cold int hevc_decode_free(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
av_freep(&s->cabac_state);
+#ifdef RPI
+
+#ifdef RPI_WORKER
+ hevc_exit_worker(s);
+#endif
+
+ for(i=0;i<RPI_MAX_JOBS;i++) {
+ av_freep(&s->unif_mv_cmds[i]);
+ av_freep(&s->univ_pred_cmds[i]);
+
+#ifdef RPI_INTER_QPU
+ if (s->unif_mvs[i]) {
+ gpu_free( &s->unif_mvs_ptr[i] );
+ s->unif_mvs[i] = 0;
+ }
+#endif
+#ifdef RPI_LUMA_QPU
+ if (s->y_unif_mvs[i]) {
+ gpu_free( &s->y_unif_mvs_ptr[i] );
+ s->y_unif_mvs[i] = 0;
+ }
+#endif
+ }
+
+#endif
+
for (i = 0; i < 3; i++) {
av_freep(&s->sao_pixel_buffer_h[i]);
av_freep(&s->sao_pixel_buffer_v[i]);
@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
return 0;
}
+#ifdef RPI
+#ifdef RPI_PRECLEAR
+static av_cold void memclear16(int16_t *p, int n)
+{
+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+ //int i;
+ //for(i=0;i<n;i++)
+ // p[i] = 0;
+}
+#endif
+#endif
+
static av_cold int hevc_init_context(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
int i;
+ int job;
s->avctx = avctx;
@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
s->HEVClcList[0] = s->HEVClc;
s->sList[0] = s;
+#ifdef RPI
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+ if (!s->unif_mv_cmds[job])
+ goto fail;
+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+ if (!s->univ_pred_cmds[job])
+ goto fail;
+ }
+
+#ifdef RPI_INTER_QPU
+ // We divide the image into blocks 256 wide and 64 high
+  // We support widths of up to 2048 pixels
+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+ // Also add space for the startup command for each stream.
+
+ {
+ int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+ uint32_t *p;
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+#ifdef RPI_CACHE_UNIF_MVS
+ gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+#else
+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+#endif
+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+
+ // Set up initial locations for uniform streams
+ p = s->unif_mvs[job];
+ for(i = 0; i < 8; i++) {
+ s->mvs_base[job][i] = p;
+ p += uv_commands_per_qpu;
+ }
+ }
+ s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+ s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+ s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+ }
+
+#endif
+#ifdef RPI_LUMA_QPU
+ for(job=0;job<RPI_MAX_JOBS;job++)
+ {
+ int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+ uint32_t *p;
+#ifdef RPI_CACHE_UNIF_MVS
+ gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+#else
+ gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+#endif
+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+
+ // Set up initial locations for uniform streams
+ p = s->y_unif_mvs[job];
+ for(i = 0; i < 12; i++) {
+ s->y_mvs_base[job][i] = p;
+ p += y_commands_per_qpu;
+ }
+ }
+ s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+ s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+#endif
+ //gpu_malloc_uncached(2048*64,&s->dummy);
+
+ s->enable_rpi = 0;
+
+#ifdef RPI_WORKER
+ hevc_init_worker(s);
+#endif
+
+#endif
+
s->cabac_state = av_malloc(HEVC_CONTEXTS);
if (!s->cabac_state)
goto fail;
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index be91010..6b03ea8 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -23,6 +23,9 @@
#ifndef AVCODEC_HEVC_H
#define AVCODEC_HEVC_H
+// define RPI to split the CABAC/prediction/transform into separate stages
+#include "config.h"
+
#include "libavutil/buffer.h"
#include "libavutil/md5.h"
@@ -37,6 +40,29 @@
#include "thread.h"
#include "videodsp.h"
+// define RPI to split the CABAC/prediction/transform into separate stages
+#ifdef RPI
+
+ #include "rpi_qpu.h"
+ // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+ #define RPI_INTER_QPU
+
+ #ifdef RPI_INTER_QPU
+ // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+ #define RPI_LUMA_QPU
+ #endif
+
+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+ #define RPI_MAX_JOBS 2
+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+ #define RPI_WORKER
+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+// #define RPI_DEBLOCK_VPU
+
+#endif
+
+#define RPI_VPU_DEBLOCK_CACHED 1
+
#define MAX_DPB_SIZE 16 // A.4.1
#define MAX_REFS 16
@@ -660,17 +686,6 @@ typedef struct CodingUnit {
uint8_t cu_transquant_bypass_flag;
} CodingUnit;
-typedef struct Mv {
- int16_t x; ///< horizontal component of motion vector
- int16_t y; ///< vertical component of motion vector
-} Mv;
-
-typedef struct MvField {
- DECLARE_ALIGNED(4, Mv, mv)[2];
- int8_t ref_idx[2];
- int8_t pred_flag;
-} MvField;
-
typedef struct NeighbourAvailable {
int cand_bottom_left;
int cand_left;
@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
uint8_t flags;
} HEVCFrame;
+#ifdef RPI_WORKER
+typedef struct HEVCLocalContextIntra {
+ TransformUnit tu;
+ NeighbourAvailable na;
+} HEVCLocalContextIntra;
+#endif
+
typedef struct HEVCLocalContext {
+ TransformUnit tu;
+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
+
uint8_t cabac_state[HEVC_CONTEXTS];
uint8_t stat_coeff[4];
@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
int qPy_pred;
- TransformUnit tu;
uint8_t ctb_left_flag;
uint8_t ctb_up_flag;
@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
int ct_depth;
CodingUnit cu;
PredictionUnit pu;
- NeighbourAvailable na;
#define BOUNDARY_LEFT_SLICE (1 << 0)
#define BOUNDARY_LEFT_TILE (1 << 1)
@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
int boundary_flags;
} HEVCLocalContext;
+
+#ifdef RPI
+
+// The processing is done in chunks
+// Each chunk corresponds to 24 64x64 luma blocks (24 because it is divisible by both 8, the number of chroma QPUs, and 12, the number of luma QPUs)
+// This is a distance of 1536 pixels across the screen
+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+// but allocate more memory and increase the latency before data in the next frame can be processed
+#define RPI_NUM_CHUNKS 1
+
+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+
+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+#define RPI_MAX_MV_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Each block can have an intra prediction and a transform_add command
+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Worst case is 16x16 CTUs
+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
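+
+// For reference, with RPI_NUM_CHUNKS == 1 as defined above the figures work
+// out as:
+//   RPI_MAX_WIDTH        = 1*64*24         = 1536 pixels
+//   RPI_MAX_MV_CMDS      = 2*16*3*(1536/4) = 36864 commands
+//   RPI_MAX_PRED_CMDS    = 2*16*3*(1536/4) = 36864 commands
+//   RPI_MAX_DEBLOCK_CMDS = 1536*4/16       = 384 commands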
+
+#define RPI_CMD_LUMA_UNI 0
+#define RPI_CMD_CHROMA_UNI 1
+#define RPI_CMD_LUMA_BI 2
+#define RPI_CMD_CHROMA_BI 3
+#define RPI_CMD_V_BI 4
+
+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+// #define RPI_PRECLEAR
+
+// Command for inter prediction
+typedef struct HEVCMvCmd {
+ int cmd;
+ uint8_t *dst;
+ ptrdiff_t dststride;
+ uint8_t *src;
+ ptrdiff_t srcstride;
+ Mv mv;
+ int x_off;
+ int y_off;
+ int block_w;
+ int block_h;
+ int weight;
+ int offset;
+ uint8_t *src1;
+ ptrdiff_t srcstride1;
+ Mv mv1;
+ int8_t ref_idx[2];
+} HEVCMvCmd;
+
+
+// Command for intra prediction and transform_add of predictions to coefficients
+#define RPI_PRED_TRANSFORM_ADD 0
+#define RPI_PRED_INTRA 1
+typedef struct HEVCPredCmd {
+ uint8_t size;
+ uint8_t type;
+ uint8_t na;
+ uint8_t c_idx;
+ union {
+ uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+ uint32_t x; // RPI_PRED_INTRA
+ };
+ union {
+ int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+ uint32_t y; // RPI_PRED_INTRA
+ };
+ union {
+        uint32_t stride;         // RPI_PRED_TRANSFORM_ADD
+        enum IntraPredMode mode; // RPI_PRED_INTRA
+ };
+} HEVCPredCmd;
+
+#endif
+
typedef struct HEVCContext {
const AVClass *c; // needed by private avoptions
AVCodecContext *avctx;
@@ -798,13 +895,107 @@ typedef struct HEVCContext {
HEVCLocalContext *HEVClcList[MAX_NB_THREADS];
HEVCLocalContext *HEVClc;
-
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra HEVClcIntra;
+#endif
uint8_t threads_type;
uint8_t threads_number;
int width;
int height;
+ int used_for_ref;
+
+#ifdef RPI
+ int enable_rpi;
+ HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+ int buf_width;
+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+ int num_coeffs[RPI_MAX_JOBS][4];
+ int num_xfm_cmds[RPI_MAX_JOBS];
+ int num_mv_cmds[RPI_MAX_JOBS];
+ int num_pred_cmds[RPI_MAX_JOBS];
+ int num_dblk_cmds[RPI_MAX_JOBS];
+ int vpu_id;
+ int pass0_job; // Pass0 does coefficient decode
+ int pass1_job; // Pass1 does pixel processing
+ int ctu_count; // Number of CTUs done in pass0 so far
+ int max_ctu_count; // Number of CTUs when we trigger a round of processing
+ int ctu_per_y_chan; // Number of CTUs per luma QPU
+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+#ifdef RPI_INTER_QPU
+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+
+ // _base pointers are to the start of the row
+ uint32_t *mvs_base[RPI_MAX_JOBS][8];
+ // these pointers are to the next free space
+ uint32_t *u_mvs[RPI_MAX_JOBS][8];
+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+ // Function pointers
+ uint32_t mc_filter_uv;
+ uint32_t mc_filter_uv_b0;
+ uint32_t mc_filter_uv_b;
+#endif
+#ifdef RPI_LUMA_QPU
+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+ uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+ uint32_t *y_mvs[RPI_MAX_JOBS][12];
+ uint32_t *curr_y_mvs; // Current uniform stream for luma
+ // Function pointers
+ uint32_t mc_filter;
+ uint32_t mc_filter_b;
+#endif
+
+#ifdef RPI_WORKER
+ pthread_t worker_thread;
+ pthread_cond_t worker_cond_head;
+ pthread_cond_t worker_cond_tail;
+ pthread_mutex_t worker_mutex;
+
+ int worker_tail; // Contains the number of posted jobs
+ int worker_head; // Contains the number of completed jobs
+ int kill_worker; // set to 1 to terminate the worker
+#endif
+
+#define RPI_DEBLOCK_VPU_Q_COUNT 2
+
+#ifdef RPI_DEBLOCK_VPU
+ int enable_rpi_deblock;
+
+ int uv_setup_width;
+ int uv_setup_height;
+ int setup_width; // Number of 16x16 blocks across the image
+ int setup_height; // Number of 16x16 blocks down the image
+
+ struct dblk_vpu_q_s
+ {
+ GPU_MEM_PTR_T deblock_vpu_gmem;
+
+ uint8_t (*y_setup_arm)[2][2][2][4];
+ uint8_t (*y_setup_vc)[2][2][2][4];
+
+    uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this ([][][1][]) is unused, but keeping it makes life easier for the VPU: it lets us store with zeros and keeps the addresses aligned
+ uint8_t (*uv_setup_vc)[2][2][2][4];
+
+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+ int vpu_cmds_vc;
+
+ int cmd_id;
+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+
+ struct dblk_vpu_q_s * dvq;
+ unsigned int dvq_n;
+
+#endif
+
+#endif
+
uint8_t *cabac_state;
/** 1 if the independent slice segment header was successfully parsed */
@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
uint32_t max_mastering_luminance;
uint32_t min_mastering_luminance;
+#ifdef RPI
+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+#endif
} HEVCContext;
int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int log2_trafo_size, enum ScanType scan_idx,
int c_idx);
+#ifdef RPI_INTER_QPU
+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+#endif
+
void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 05b2821..e2f1f4e 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -21,14 +21,72 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define UNCHECKED_BITSTREAM_READER 1
+
#include "libavutil/attributes.h"
#include "libavutil/common.h"
-#include "cabac_functions.h"
#include "hevc.h"
+#include "cabac_functions.h"
+
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV ARCH_X86
+
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
+
+#if ARCH_ARM
+#include "arm/hevc_cabac.h"
+#endif
#define CABAC_MAX_BIN 31
+
+#if USE_BY22 && !USE_BY22_DIV
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
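+// I(x) is roughly 2^40/x, so in get_cabac_by22_peek() below
+// (low * I(range)) >> 32 is approximately (low / range) << 8, which after the
+// final "<< 1" matches the (low / range) << 9 of the real-divide path.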
+
+static const uint32_t cabac_by22_inv_range[256] = {
+ 0, I(257), I(258), I(259),
+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+ I(510), I(511)
+};
+#undef I
+#endif // USE_BY22
+
/**
* number of bin by SyntaxElement.
*/
@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
{ 28, 36, 43, 49, 54, 58, 61, 63, },
};
+
+typedef struct
+{
+ uint16_t coeff;
+ uint16_t scale;
+} xy_off_t;
+
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
+
+#define OFF_DIAG(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+#define OFF_HORIZ(t) {\
+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+}
+
+#define OFF_VERT(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+static const xy_off_t off_xys[3][4][16] =
+{
+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+};
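+
+// Each xy_off_t above pairs two offsets for a position within a 4x4
+// coefficient group: .coeff is the offset of that position from the group's
+// top-left corner within the transform block (row stride = trafo_size) and
+// .scale is the matching offset into the (at most 8x8) scale matrix.  For
+// 16x16 and 32x32 transforms SCALE_SHR() subsamples the coordinates by 2 and
+// 4 respectively; for 4x4 and 8x8 blocks the two offsets are identical.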
+
+
+// Helper fns
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+#define hevc_clz32 hevc_clz32_builtin
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+{
+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+}
+#endif
+
+// It is unlikely that we will ever need this but include for completeness
+#ifndef hevc_clz32
+static inline unsigned int hevc_clz32(unsigned int x)
+{
+ unsigned int n = 1;
+ if ((x & 0xffff0000) == 0) {
+ n += 16;
+ x <<= 16;
+ }
+ if ((x & 0xff000000) == 0) {
+ n += 8;
+ x <<= 8;
+ }
+ if ((x & 0xf0000000) == 0) {
+ n += 4;
+ x <<= 4;
+ }
+ if ((x & 0xc0000000) == 0) {
+ n += 2;
+ x <<= 2;
+ }
+ return n - ((x >> 31) & 1);
+}
+#endif
+
+
+#if !USE_BY22
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+// will no longer be called but the setup calls will still exist and we want
+// to null them out
+#define bypass_start(s)
+#define bypass_finish(s)
+#else
+// Use BY22 for residual bypass block
+
+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+
+// BY22 exploits the fact that bypass decoding is simply a divide into the
+// bitstream, so we can peek out large quantities of bits at once and treat
+// the result as if it were VLC.  In many cases this leads to O(1) processing
+// rather than O(n), though the setup and teardown are sufficiently expensive
+// that it is only worth using if we expect to be dealing with more than a
+// few bits.  The definition of "a few bits" will vary from platform to
+// platform, but tests on ARM show that it probably isn't worth it for a
+// single coded residual, but is for >1 - it also seems likely that if there
+// are more residuals then they are likely to be bigger and this will make
+// the O(1) nature of the code more worthwhile.
+
+
+#if !USE_BY22_DIV
+// * 1/x @ 32 bits gets us 22 bits of accuracy
+#define CABAC_BY22_PEEK_BITS 22
+#else
+// A real 32-bit divide gets us another bit
+// If we have a 64 bit int & a unit time divider then we should get a lot
+// of bits (55) but that is untested and it is unclear if it would give
+// us a large advantage
+#define CABAC_BY22_PEEK_BITS 23
+#endif
+
+// Bypass block start
+// Must be called before _by22_peek is used as it sets the CABAC environment
+// into the correct state. _by22_finish must be called to return to 'normal'
+// (i.e. non-bypass) cabac decoding
+static inline void get_cabac_by22_start(CABACContext * const c)
+{
+ const unsigned int bits = __builtin_ctz(c->low);
+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+#if !USE_BY22_DIV
+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+#endif
+
+ c->bytestream -= (CABAC_BITS / 8);
+ c->by22.bits = bits;
+#if !USE_BY22_DIV
+ c->by22.range = c->range;
+ c->range = inv;
+#endif
+ c->low = x;
+}
+
+// Bypass block finish
+// Must be called at the end of the bypass block to return to normal operation
+static inline void get_cabac_by22_finish(CABACContext * const c)
+{
+ unsigned int used = c->by22.bits;
+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+
+ c->bytestream += bytes_used + (CABAC_BITS / 8);
+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+#if !USE_BY22_DIV
+ c->range = c->by22.range;
+#endif
+}
+
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is called and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22 which should be long enough for any prefix or suffix
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+{
+#if USE_BY22_DIV
+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+#else
+ uint32_t x = c->low & ~1U;
+ const uint32_t inv = c->range;
+
+ if (inv != 0)
+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+ return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
+ // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+ // and refill lower bits
+ // We will probably OR over some existing bits but that doesn't matter
+ c->by22.bits += n;
+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif // USE_BY22
+
+
void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
{
if (s->ps.pps->entropy_coding_sync_enabled_flag &&
@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
}
-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
}
int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
}
-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
int log2_size, int *last_scx_prefix, int *last_scy_prefix)
{
int i = 0;
int max = (log2_size << 1) - 1;
int ctx_offset, ctx_shift;
- if (!c_idx) {
+ if (!c_idx_nz) {
ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
ctx_shift = (log2_size + 1) >> 2;
} else {
@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
return value;
}
-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
{
int inc;
- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+ inc = (ctx_cg != 0) + (c_idx_nz << 1);
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
}
-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
- int offset, const uint8_t *ctx_idx_map)
-{
- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-}
-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
{
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
}
@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
}
-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ unsigned int prefix;
+ unsigned int last_coeff_abs_level_remaining;
+ unsigned int n;
+
+ y = get_cabac_by22_peek(c);
+ prefix = hevc_clz32(~y);
+ // y << prefix will always have top bit 0
+
+ if (prefix < 3) {
+ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+ n = prefix + 1 + rice_param;
+ }
+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+ {
+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix * 2 + rice_param - 2;
+ }
+ else {
+ unsigned int suffix;
+
+ get_cabac_by22_flush(c, prefix, y);
+ y = get_cabac_by22_peek(c);
+
+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix + rice_param - 2;
+ }
+
+ get_cabac_by22_flush(c, n, y);
+
+ return last_coeff_abs_level_remaining;
+}
+#endif
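+
+// Worked example of the short-prefix path above: with rice_param == 1 and the
+// top of the peeked word reading 1,0,1,... (prefix = 1, the terminating zero,
+// then one suffix bit = 1) the decoded value is
+// (prefix << rice_param) + suffix = 3 and prefix + 1 + rice_param = 3 bits
+// are flushed.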
+
+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+{
+ CABACContext * const c = &s->HEVClc->cc;
int prefix = 0;
int suffix = 0;
int last_coeff_abs_level_remaining;
int i;
- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
prefix++;
if (prefix == CABAC_MAX_BIN) {
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
return 0;
}
+
if (prefix < 3) {
for (i = 0; i < rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
} else {
int prefix_minus3 = prefix - 3;
for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
<< rc_rice_param) + suffix;
}
+
return last_coeff_abs_level_remaining;
}
-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
{
- int i;
- int ret = 0;
+ CABACContext * const c = &s->HEVClc->cc;
+ unsigned int i;
+ uint32_t ret = 0;
for (i = 0; i < nb; i++)
- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
- return ret;
+ ret = (ret << 1) | get_cabac_bypass(c);
+
+ return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ y = get_cabac_by22_peek(c);
+ get_cabac_by22_flush(c, nb, y);
+ return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i;
+ unsigned int rv = 0;
+ for (i = 0; i != n; ++i) {
+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+ const unsigned int b = get_cabac(c, state0 + idx);
+ rv = (rv << 1) | b;
+ }
+ return rv;
+}
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
+// this version of events.
+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+ int * const pprev_subset_coded, int * const psum,
+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+ unsigned int rv;
+ unsigned int i;
+ const unsigned int n = FFMIN(n_end, 8);
+
+ // Really this is i != n but the simple unconditional loop is cheaper
+ // and faster
+ for (i = 0; i != 8; ++i)
+ levels[i] = 1;
+
+ rv = get_cabac_greater1_bits(c, n, state0);
+
+ *pprev_subset_coded = 0;
+ *psum = n;
+
+ rv <<= (32 - n);
+ if (rv != 0)
+ {
+ *pprev_subset_coded = 1;
+ *psum = n + 1;
+ i = hevc_clz32(rv);
+ levels[i] = 2;
+ if (get_cabac(c, state_gt2) == 0)
+ {
+ // Unset first coded bit
+ rv &= ~(0x80000000U >> i);
+ }
+ }
+
+ if (n_end > 8) {
+ const unsigned int g8 = n_end - 8;
+ rv |= ((1 << g8) - 1) << (24 - g8);
+ for (i = 0; i != g8; ++i) {
+ levels[i + 8] = 0;
+ }
+ }
+
+ return rv;
+}
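+
+// Illustrative example of the return value: with n_end == 4 and greater-1
+// flags decoding as 0,1,0,0, rv becomes 0x40000000, levels[1] is set to 2,
+// *pprev_subset_coded == 1 and *psum == 5; if the following greater-2 flag
+// then decodes as 0 the set bit is cleared again and 0 is returned.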
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+// or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift to a min of 0, so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+// to achieve it
+
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
}
+#endif
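+
+// For the common 8-bit case the numbers work out as follows: an 8x8 transform
+// gives shift = bit_depth + log2_trafo_size - 6 = 8 + 3 - 6 = 5 in the
+// dequant setup below; if div6[qp] >= 5 the whole shift is folded into scale
+// and shift becomes 0, otherwise shift is just reduced by div6[qp].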
+
+
+#ifndef update_rice
+static inline void update_rice(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+ if (x >= 6)
+ (*stat_coeff)++;
+ else if (x == 0 && *stat_coeff > 0)
+ (*stat_coeff)--;
+}
+#endif
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+                                                      const uint8_t * const ctx_map,
+ uint8_t * p)
+{
+ do {
+ if (get_cabac(c, state0 + ctx_map[n]))
+ *p++ = n;
+ } while (--n != 0);
+ return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+                                   const uint8_t * const ctx_map,
+ uint8_t * const flag_idx)
+{
+ int rv;
+
+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+ return rv;
+}
+
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x1, x2, x3,\
+ x4, x5, x6, x7,\
+ x8, x9, x10, x11,\
+ x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x8, x12,\
+ x1, x5, x9, x13,\
+ x2, x6, x10, x14,\
+ x3, x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x1, x8,\
+ x5, x2, x12, x9,\
+ x6, x3, x13, x10,\
+ x7, x14, x11, x15}
+
+
+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+ uint8_t * const significant_coeff_group_flag,
+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+ int * const pPrev_sig)
+{
+ while (--i >= 0) {
+ unsigned int x_cg = scan_x_cg[i];
+ unsigned int y_cg = scan_y_cg[i];
+
+ // For the flag decode we only care about Z/NZ but
+ // we use the full Right + Down * 2 when calculating
+ // significant coeff flags so we obtain it here
+ //.
+ // The group flag array is one longer than it needs to
+ // be so we don't need to check for y_cg limits
+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
+
+ if (i == 0 ||
+ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+ {
+ significant_coeff_group_flag[y_cg] |= (1 << x_cg);
+ *pPrev_sig = prev_sig;
+ break;
+ }
+ }
+
+ return i;
+}
+
void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int log2_trafo_size, enum ScanType scan_idx,
int c_idx)
{
-#define GET_COORD(offset, n) \
- do { \
- x_c = (x_cg << 2) + scan_x_off[n]; \
- y_c = (y_cg << 2) + scan_y_off[n]; \
- } while (0)
- HEVCLocalContext *lc = s->HEVClc;
- int transform_skip_flag = 0;
+ HEVCLocalContext * const lc = s->HEVClc;
+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
int last_significant_coeff_x, last_significant_coeff_y;
- int last_scan_pos;
- int n_end;
int num_coeff = 0;
- int greater1_ctx = 1;
+ int prev_subset_coded = 0;
int num_last_subset;
int x_cg_last_sig, y_cg_last_sig;
- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+ const uint8_t *scan_x_cg, *scan_y_cg;
+ const xy_off_t * scan_xy_off;
ptrdiff_t stride = s->frame->linesize[c_idx];
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
((x0 >> hshift) << s->ps.sps->pixel_shift)];
+#ifdef RPI
+ //***** transform_skip_flag decoded later!
+ int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+#endif
int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
- uint8_t significant_coeff_group_flag[8][8] = {{0}};
+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
int explicit_rdpcm_flag = 0;
int explicit_rdpcm_dir_flag;
int trafo_size = 1 << log2_trafo_size;
int i;
- int qp,shift,add,scale,scale_m;
+ int qp,shift,scale;
static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
const uint8_t *scale_matrix = NULL;
uint8_t dc_scale;
int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
lc->tu.intra_pred_mode_c;
+ int prev_sig = 0;
+ const int c_idx_nz = (c_idx != 0);
+
+ int may_hide_sign;
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ int n = trafo_size * trafo_size;
+ if (use_vpu) {
+            // We support log2_trafo_size 4 and 5.
+            // Size 4 grows from the front (coeffs_buf_arm[2] points to the start of the buffer)
+            // Size 5 grows from the back (coeffs_buf_arm[3] points to the end of the buffer)
+ // num_coeffs is indexed by log2_trafo_size-2
+ if (log2_trafo_size == 4)
+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+ else
+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+ s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+ } else {
+ coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+ s->num_coeffs[s->pass0_job][0] += n;
+ }
+ }
+ // We now do the memset after transform_add while we know the data is cached.
+ #ifdef RPI_PRECLEAR
+ #else
+ memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+ #endif
+#else
memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+#endif
+
+
// Derive QP for dequant
if (!lc->cu.cu_transquant_bypass_flag) {
- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
static const uint8_t rem6[51 + 4 * 6 + 1] = {
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
};
int qp_y = lc->qp_y;
+ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
if (s->ps.pps->transform_skip_enabled_flag &&
log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+ if (transform_skip_flag) {
+ trans_skip_or_bypass = 1;
+ if (lc->cu.pred_mode == MODE_INTRA &&
+ s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+ may_hide_sign = 0;
+ }
+ }
}
if (c_idx == 0) {
@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
qp += s->ps.sps->qp_bd_offset;
}
- shift = s->ps.sps->bit_depth + log2_trafo_size - 5;
- add = 1 << (shift-1);
- scale = level_scale[rem6[qp]] << (div6[qp]);
- scale_m = 16; // default when no custom scaling lists.
- dc_scale = 16;
+ // Shift is set to one less than will actually occur as the scale
+ // and saturate step adds 1 and then shifts right again
+ shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+ scale = level_scale[rem6[qp]];
+ if (div6[qp] >= shift) {
+ scale <<= (div6[qp] - shift);
+ shift = 0;
+ } else {
+ shift -= div6[qp];
+ }
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
int matrix_id = lc->cu.pred_mode != MODE_INTRA;
matrix_id = 3 * matrix_id + c_idx;
scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+ dc_scale = scale_matrix[0];
if (log2_trafo_size >= 4)
dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
}
+ else
+ {
+ static const uint8_t sixteen_scale[64] = {
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ };
+ scale_matrix = sixteen_scale;
+ dc_scale = 16;
+ }
} else {
+ static const uint8_t unit_scale[64] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ };
+ scale_matrix = unit_scale;
shift = 0;
- add = 0;
- scale = 0;
- dc_scale = 0;
+ scale = 2; // We will shift right to kill this
+ dc_scale = 1;
+
+ may_hide_sign = 0;
}
if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+ trans_skip_or_bypass) {
+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
if (explicit_rdpcm_flag) {
- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+ may_hide_sign = 0;
+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
}
}
- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
&last_significant_coeff_x, &last_significant_coeff_y);
if (last_significant_coeff_x > 3) {
@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int last_x_c = last_significant_coeff_x & 3;
int last_y_c = last_significant_coeff_y & 3;
- scan_x_off = ff_hevc_diag_scan4x4_x;
- scan_y_off = ff_hevc_diag_scan4x4_y;
num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
- if (trafo_size == 4) {
+
+ switch (log2_trafo_size) {
+ case 2:
scan_x_cg = scan_1x1;
scan_y_cg = scan_1x1;
- } else if (trafo_size == 8) {
+ break;
+ case 3:
num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = diag_scan2x2_x;
scan_y_cg = diag_scan2x2_y;
- } else if (trafo_size == 16) {
+ break;
+ case 4:
num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan4x4_x;
scan_y_cg = ff_hevc_diag_scan4x4_y;
- } else { // trafo_size == 32
+ break;
+ case 5:
+ default:
num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan8x8_x;
scan_y_cg = ff_hevc_diag_scan8x8_y;
+ break;
}
break;
}
case SCAN_HORIZ:
scan_x_cg = horiz_scan2x2_x;
scan_y_cg = horiz_scan2x2_y;
- scan_x_off = horiz_scan4x4_x;
- scan_y_off = horiz_scan4x4_y;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
break;
default: //SCAN_VERT
scan_x_cg = horiz_scan2x2_y;
scan_y_cg = horiz_scan2x2_x;
- scan_x_off = horiz_scan4x4_y;
- scan_y_off = horiz_scan4x4_x;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
break;
}
num_coeff++;
num_last_subset = (num_coeff - 1) >> 4;
- for (i = num_last_subset; i >= 0; i--) {
- int n, m;
- int x_cg, y_cg, x_c, y_c, pos;
- int implicit_non_zero_coeff = 0;
- int64_t trans_coeff_level;
- int prev_sig = 0;
- int offset = i << 4;
- int rice_init = 0;
+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
- uint8_t significant_coeff_flag_idx[16];
- uint8_t nb_significant_coeff_flag = 0;
-
- x_cg = scan_x_cg[i];
- y_cg = scan_y_cg[i];
-
- if ((i < num_last_subset) && (i > 0)) {
- int ctx_cg = 0;
- if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
- significant_coeff_group_flag[x_cg][y_cg] =
- significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
- implicit_non_zero_coeff = 1;
- } else {
- significant_coeff_group_flag[x_cg][y_cg] =
- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
- (x_cg == 0 && y_cg == 0));
- }
+ i = num_last_subset;
+ do {
+ int implicit_non_zero_coeff = 0;
+ int n_end;
- last_scan_pos = num_coeff - offset - 1;
+ uint8_t significant_coeff_flag_idx[16];
+ unsigned int nb_significant_coeff_flag = 0;
if (i == num_last_subset) {
+ // First time through
+ int last_scan_pos = num_coeff - (i << 4) - 1;
n_end = last_scan_pos - 1;
significant_coeff_flag_idx[0] = last_scan_pos;
nb_significant_coeff_flag = 1;
} else {
n_end = 15;
+ implicit_non_zero_coeff = (i != 0);
}
- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-
- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
- static const uint8_t ctx_idx_map[] = {
- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+ if (n_end >= 0) {
+ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+ };
+ static const uint8_t ctx_idx_maps[3][4][16] = {
+ {
+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ }
};
const uint8_t *ctx_idx_map_p;
int scf_offset = 0;
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
- if (c_idx == 0) {
- scf_offset = 40;
- } else {
- scf_offset = 14 + 27;
- }
+
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ ctx_idx_map_p = ctx_idx_maps[0][3];
+ scf_offset = 40 + c_idx_nz;
} else {
- if (c_idx != 0)
+ if (c_idx_nz != 0)
scf_offset = 27;
+
if (log2_trafo_size == 2) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
} else {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
- if (c_idx == 0) {
- if ((x_cg > 0 || y_cg > 0))
+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+ if (!c_idx_nz) {
+ if (i != 0)
scf_offset += 3;
+
if (log2_trafo_size == 3) {
scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
} else {
@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
}
- for (n = n_end; n > 0; n--) {
- x_c = scan_x_off[n];
- y_c = scan_y_off[n];
- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
- significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
- nb_significant_coeff_flag++;
+
+ if (n_end > 0) {
+ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+ n_end, ctx_idx_map_p,
+ significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+ nb_significant_coeff_flag += cnt;
+ if (cnt != 0) {
implicit_non_zero_coeff = 0;
}
}
+
if (implicit_non_zero_coeff == 0) {
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- if (c_idx == 0) {
- scf_offset = 42;
- } else {
- scf_offset = 16 + 27;
- }
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ scf_offset = 42 + c_idx_nz;
} else {
if (i == 0) {
- if (c_idx == 0)
- scf_offset = 0;
- else
- scf_offset = 27;
+ scf_offset = c_idx_nz ? 27 : 0;
} else {
scf_offset = 2 + scf_offset;
}
}
- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
nb_significant_coeff_flag++;
}
@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
- n_end = nb_significant_coeff_flag;
-
+ if (nb_significant_coeff_flag != 0) {
+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ ((i != 0 && !c_idx_nz) ? 2 : 0) |
+ prev_subset_coded;
+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+ (gt1_idx_delta << 2);
+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+ gt1_idx_delta;
+
+ const unsigned int x_cg = scan_x_cg[i];
+ const unsigned int y_cg = scan_y_cg[i];
+ int16_t * const blk_coeffs = coeffs +
+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+ // This calculation is 'wrong' for log2_trafo_size == 2
+ // but that doesn't matter as in this case x_cg & y_cg
+ // are always 0 so the result is correct (0) anyway
+ const uint8_t * const blk_scale = scale_matrix +
+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+ // * The following code block doesn't deal with these flags:
+ // (nor did the one it replaces)
+ //
+ // cabac_bypass_alignment_enabled_flag
+ // This should be easy but I can't find a test case
+ // extended_precision_processing_flag
+ // This can extend the required precision past 16bits
+ // so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+ if (nb_significant_coeff_flag == 1) {
+ // There is a small gain to be had from special casing the single
+ // transform coefficient case. The reduction in complexity
+ // makes up for the code duplication.
+
+ int trans_coeff_level = 1;
+ int coeff_sign_flag;
+ int coded_val = 0;
+
+ // initialize first elem of coeff_abs_level_greater1_flag
+ prev_subset_coded = 0;
+
+ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+ trans_coeff_level = 2;
+ prev_subset_coded = 1;
+ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+ }
- if (n_end) {
- int first_nz_pos_in_cg;
- int last_nz_pos_in_cg;
- int c_rice_param = 0;
- int first_greater1_coeff_idx = -1;
- uint8_t coeff_abs_level_greater1_flag[8];
- uint16_t coeff_sign_flag;
- int sum_abs = 0;
- int sign_hidden;
- int sb_type;
+ // Probably not worth the overhead of starting the by22 bypass block for just one value
+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+ if (coded_val)
+ {
+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+ } else {
+ uint8_t * const stat_coeff =
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ const unsigned int c_rice_param = *stat_coeff >> 2;
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
- // initialize first elem of coeff_bas_level_greater1_flag
- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ }
+ }
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
- sb_type = 2 * (c_idx == 0 ? 1 : 0);
- else
- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
- c_rice_param = lc->stat_coeff[sb_type] / 4;
- }
+ {
+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+ const unsigned int scale_m = blk_scale[xy_off->scale];
- if (!(i == num_last_subset) && greater1_ctx == 0)
- ctx_set++;
- greater1_ctx = 1;
- last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
- int inc = (ctx_set << 2) + greater1_ctx;
- coeff_abs_level_greater1_flag[m] =
- coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
- if (coeff_abs_level_greater1_flag[m]) {
- greater1_ctx = 0;
- if (first_greater1_coeff_idx == -1)
- first_greater1_coeff_idx = m;
- } else if (greater1_ctx > 0 && greater1_ctx < 3) {
- greater1_ctx++;
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (trans_coeff_level ^ k) - k, // Apply sign
+ scale,
+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+ shift);
}
}
- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-
- if (lc->cu.cu_transquant_bypass_flag ||
- (lc->cu.pred_mode == MODE_INTRA &&
- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag &&
- (pred_mode_intra == 10 || pred_mode_intra == 26 )) ||
- explicit_rdpcm_flag)
- sign_hidden = 0;
else
- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+#endif
+ {
+ int sign_hidden = may_hide_sign;
+ int levels[16]; // Should be able to get away with int16_t but that fails some tests
+ uint32_t coeff_sign_flags;
+ uint32_t coded_vals = 0;
+ // Sum(abs(level[]))
+ // In fact we only need the bottom bit and in some future
+ // version that may be all we calculate
+ unsigned int sum_abs;
+
+ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+
+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+ sign_hidden = 0;
+
+ // -- Start bypass block
+
+ bypass_start(s);
+
+ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+
+ if (coded_vals != 0)
+ {
+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+ int * level = levels - 1;
+
+ do {
+ {
+ const unsigned int z = hevc_clz32(coded_vals) + 1;
+ level += z;
+ coded_vals <<= z;
+ }
- if (first_greater1_coeff_idx != -1) {
- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
- }
- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
- } else {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
- }
+ {
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
+
+ sum_abs += last_coeff_abs_level_remaining + 1;
+ *level = trans_coeff_level;
+
+ if (stat_coeff != NULL)
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ stat_coeff = NULL;
- for (m = 0; m < n_end; m++) {
- n = significant_coeff_flag_idx[m];
- GET_COORD(offset, n);
- if (m < 8) {
- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level += last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
+ if (trans_coeff_level > (3 << c_rice_param) &&
+ (c_rice_param < 4 || rice_adaptation_enabled))
+ ++c_rice_param;
}
- }
- } else {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level = 1 + last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
- }
+ } while (coded_vals != 0);
}
- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
- sum_abs += trans_coeff_level;
- if (n == first_nz_pos_in_cg && (sum_abs&1))
- trans_coeff_level = -trans_coeff_level;
+
+ // sign_hidden = 0 or 1 so we can combine the tests
+ if ((sign_hidden & sum_abs) != 0) {
+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
}
- if (coeff_sign_flag >> 15)
- trans_coeff_level = -trans_coeff_level;
- coeff_sign_flag <<= 1;
- if(!lc->cu.cu_transquant_bypass_flag) {
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
- if(y_c || x_c || log2_trafo_size < 4) {
- switch(log2_trafo_size) {
- case 3: pos = (y_c << 3) + x_c; break;
- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
- default: pos = (y_c << 2) + x_c; break;
- }
- scale_m = scale_matrix[pos];
- } else {
- scale_m = dc_scale;
- }
+
+ bypass_finish(s);
+
+ // -- Finish bypass block
+
+ // Scale loop
+ {
+ int m = nb_significant_coeff_flag - 1;
+
+ // Deal with DC component (if any) first
+ if (i == 0 && significant_coeff_flag_idx[m] == 0)
+ {
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+ blk_coeffs[0] = trans_scale_sat(
+ (levels[m] ^ k) - k, scale, dc_scale, shift);
+ --m;
}
- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
- if(trans_coeff_level < 0) {
- if((~trans_coeff_level) & 0xFffffffffff8000)
- trans_coeff_level = -32768;
- } else {
- if(trans_coeff_level & 0xffffffffffff8000)
- trans_coeff_level = 32767;
+
+#if !USE_N_END_1
+ // If USE_N_END_1 is set then m was at least 1 initially
+ if (m >= 0)
+#endif
+ {
+ do {
+ const xy_off_t * const xy_off = scan_xy_off +
+ significant_coeff_flag_idx[m];
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (levels[m] ^ k) - k,
+ scale,
+ blk_scale[xy_off->scale],
+ shift);
+ } while (--m >= 0);
}
}
- coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+
}
}
- }
+ } while ((i = next_subset(s, i, c_idx_nz,
+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
if (lc->cu.cu_transquant_bypass_flag) {
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else {
- if (transform_skip_flag) {
+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
log2_trafo_size == 2 &&
lc->cu.pred_mode == MODE_INTRA;
@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
for (i = 0; i < 8; i++)
FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
}
-
s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
- s->hevcdsp.idct_4x4_luma(coeffs);
+ s->hevcdsp.idct_4x4_luma(coeffs);
} else {
+#ifdef RPI
+ if (!use_vpu) {
+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+ if (max_xy == 0) {
+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+ } else {
+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+ if (max_xy < 4)
+ col_limit = FFMIN(4, col_limit);
+ else if (max_xy < 8)
+ col_limit = FFMIN(8, col_limit);
+ else if (max_xy < 12)
+ col_limit = FFMIN(24, col_limit);
+
+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+ }
+ }
+#else
int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
if (max_xy == 0)
s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
col_limit = FFMIN(24, col_limit);
s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
}
+#endif
}
}
if (lc->tu.cross_pf) {
@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
}
+#ifdef RPI
+ if (s->enable_rpi) {
+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ cmd->type = RPI_PRED_TRANSFORM_ADD;
+ cmd->size = log2_trafo_size;
+ cmd->buf = coeffs;
+ cmd->dst = dst;
+ cmd->stride = stride;
+ return;
+ }
+#endif
s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
}
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 1f33b0c..55a0315 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -22,6 +22,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_SAO
+//#define DISABLE_DEBLOCK
+//#define DISABLE_STRENGTHS
+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+//#define DISABLE_DEBLOCK_NONREF
+
#include "libavutil/common.h"
#include "libavutil/internal.h"
@@ -31,6 +37,11 @@
#include "bit_depth_template.c"
+#ifdef RPI
+#include "rpi_user_vcsm.h"
+#include "rpi_qpu.h"
+#endif
+
#define LUMA 0
#define CB 1
#define CR 2
@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
+#ifdef DISABLE_SAO
+ return;
+#endif
+
if (restore) {
if (!edges[0]) {
left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->ps.sps->pcm.loop_filter_disable_flag) ||
s->ps.pps->transquant_bypass_enable_flag;
+#ifdef DISABLE_DEBLOCK_NONREF
+ if (!s->used_for_ref)
+ return; // Don't deblock non-reference frames
+#endif
+#ifdef DISABLE_DEBLOCK
+ return;
+#endif
+ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+ return;
if (x0) {
left_tc_offset = s->deblock[ctb - 1].tc_offset;
left_beta_offset = s->deblock[ctb - 1].beta_offset;
@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((y>>3) & 1) << 1;
+ int b = (x>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[0][b][0][a] = beta;
+ setup[0][b][0][a + 1] = beta;
+ setup[0][b][1][a] = tc[0];
+ setup[0][b][1][a + 1] = tc[1];
+ } else
+#endif
s->hevcdsp.hevc_v_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((x>>3) & 1) << 1;
+ int b = (y>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[1][b][0][a] = beta;
+ setup[1][b][0][a + 1] = beta;
+ setup[1][b][1][a] = tc[0];
+ setup[1][b][1][a + 1] = tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((yc>>3) & 1) << 1;
+ int b = (xc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[0][b][0][a] = c_tc[0];
+ setup[0][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_v_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
+
}
}
@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((xc>>3) & 1) << 1;
+ int b = (yc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[1][b][0][a] = c_tc[0];
+ setup[1][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
}
}
-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
- RefPicList *neigh_refPicList)
-{
- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
- // same L0 and L1
- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] &&
- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else {
- return 1;
- }
- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
- Mv A, B;
- int ref_A, ref_B;
-
- if (curr->pred_flag & 1) {
- A = curr->mv[0];
- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
- } else {
- A = curr->mv[1];
- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
- }
-
- if (neigh->pred_flag & 1) {
- B = neigh->mv[0];
- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
- } else {
- B = neigh->mv[1];
- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
- }
-
- if (ref_A == ref_B) {
- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
- return 1;
- else
- return 0;
- } else
- return 1;
- }
-
- return 1;
-}
void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_trafo_size)
@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
int min_pu_width = s->ps.sps->min_pu_width;
int min_tu_width = s->ps.sps->min_tb_width;
- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
int boundary_upper, boundary_left;
- int i, j, bs;
+ int i, j;
+ RefPicList *rpl = s->ref->refPicList;
+ int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2;
+ int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+ int y_pu = y0 >> log2_min_pu_size;
+ int x_pu = x0 >> log2_min_pu_size;
+ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ int is_intra = curr->pred_flag == PF_INTRA;
+ int inc = log2_min_pu_size == 2 ? 2 : 1;
+ uint8_t *bs;
+
+#ifdef DISABLE_STRENGTHS
+ return;
+#endif
boundary_upper = y0 > 0 && !(y0 & 7);
if (boundary_upper &&
@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_upper = 0;
+ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_upper) {
RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
- s->ref->refPicList;
- int yp_pu = (y0 - 1) >> log2_min_pu_size;
- int yq_pu = y0 >> log2_min_pu_size;
- int yp_tu = (y0 - 1) >> log2_min_tu_size;
- int yq_tu = y0 >> log2_min_tu_size;
+ rpl;
+ MvField *top = curr - min_pu_width;
+
+ if (is_intra) {
+ for (i = 0; i < (1 << log2_trafo_size); i += 4)
+ bs[i >> 2] = 2;
+
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+ curr, top, bs);
for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- int x_tu = (x0 + i) >> log2_min_tu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || top_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, top, rpl_top);
- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+ int i_pu = i >> log2_min_pu_size;
+ int i_tu = i >> log2_min_tu_size;
+
+ if (top[i_pu].pred_flag == PF_INTRA)
+ bs[i >> 2] = 2;
+ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+ bs[i >> 2] = 1;
}
+ }
+ }
+
+ if (!is_intra) {
+ for (j = inc; j < trafo_in_min_pus; j += inc) {
+ MvField *top;
+
+ curr += min_pu_width * inc;
+ top = curr - min_pu_width;
+ bs += s->bs_width * inc << log2_min_pu_size >> 2;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, top, bs);
+ }
}
- // bs for vertical TU boundaries
boundary_left = x0 > 0 && !(x0 & 7);
if (boundary_left &&
((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_left = 0;
+ curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_left) {
RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
- s->ref->refPicList;
- int xp_pu = (x0 - 1) >> log2_min_pu_size;
- int xq_pu = x0 >> log2_min_pu_size;
- int xp_tu = (x0 - 1) >> log2_min_tu_size;
- int xq_tu = x0 >> log2_min_tu_size;
+ rpl;
+ MvField *left = curr - 1;
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int y_pu = (y0 + i) >> log2_min_pu_size;
- int y_tu = (y0 + i) >> log2_min_tu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-
- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || left_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, left, rpl_left);
- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
- }
- }
+ if (is_intra) {
+ for (j = 0; j < (1 << log2_trafo_size); j += 4)
+ bs[j * s->bs_width >> 2] = 2;
- if (log2_trafo_size > log2_min_pu_size && !is_intra) {
- RefPicList *rpl = s->ref->refPicList;
-
- // bs for TU internal horizontal PU boundaries
- for (j = 8; j < (1 << log2_trafo_size); j += 8) {
- int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
- int yq_pu = (y0 + j) >> log2_min_pu_size;
-
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-
- bs = boundary_strength(s, curr, top, rpl);
- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+ curr, left, bs);
+
+ for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+ int j_pu = j >> log2_min_pu_size;
+ int j_tu = j >> log2_min_tu_size;
+
+ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+ bs[j * s->bs_width >> 2] = 2;
+ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+ bs[j * s->bs_width >> 2] = 1;
}
}
+ }
- // bs for TU internal vertical PU boundaries
- for (j = 0; j < (1 << log2_trafo_size); j += 4) {
- int y_pu = (y0 + j) >> log2_min_pu_size;
+ if (!is_intra) {
+ for (i = inc; i < trafo_in_min_pus; i += inc) {
+ MvField *left;
- for (i = 8; i < (1 << log2_trafo_size); i += 8) {
- int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
- int xq_pu = (x0 + i) >> log2_min_pu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+ curr += inc;
+ left = curr - 1;
+ bs += inc << log2_min_pu_size >> 2;
- bs = boundary_strength(s, curr, left, rpl);
- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
- }
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, left, bs);
}
}
}
@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
#undef CB
#undef CR
+#if !defined(RPI_FAST_CACHEFLUSH)
+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+static void flush_buffer_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+ gpu_cache_flush(&p);
+}
+
+static void flush_buffer_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+ gpu_cache_flush(&p);
+}
+
+static void flush_buffer_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+ gpu_cache_flush(&p);
+}
+#endif
+#endif
+
+
+#ifdef RPI_DEBLOCK_VPU
+#error Not fixed yet
+
+// ff_hevc_flush_buffer_lines
+// flushes and invalidates all pixel rows in [start,end-1]
+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ int curr_y = start;
+ int n = end;
+ int curr_uv = curr_y >> s->ps.sps->vshift[1];
+ int n_uv = n >> s->ps.sps->vshift[1];
+ int sz,base;
+ GPU_MEM_PTR_T p;
+ if (curr_uv < 0) curr_uv = 0;
+ if (n_uv<=curr_uv) { return; }
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ if (flush_chroma) {
+ p = get_gpu_mem_ptr_u(s->frame);
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)p.arm + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)p.arm + base;
+ iocache.s[1].size = sz;
+ }
+ if (flush_luma) {
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)p.arm + base;
+ iocache.s[2].size = sz;
+ }
+ vcsm_clean_invalid( &iocache );
+#else
+ if (flush_chroma) {
+ flush_buffer_u(s->frame);
+ flush_buffer_v(s->frame);
+ }
+ if (flush_luma) {
+ flush_buffer_y(s->frame);
+ }
+#endif
+}
+#endif
+
+#ifdef RPI_INTER_QPU
+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+{
+ if (s->enable_rpi && s->used_for_ref) {
+ // TODO make this use ff_hevc_flush_buffer_lines
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ int curr_y = ((int *)f->progress->data)[0];
+ int curr_uv = curr_y >> s->ps.sps->vshift[1];
+ int n_uv = n >> s->ps.sps->vshift[1];
+ int sz,base;
+ GPU_MEM_PTR_T p;
+ if (curr_uv < 0) curr_uv = 0;
+ if (n_uv<=curr_uv) { return; }
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ p = get_gpu_mem_ptr_u(s->frame);
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)p.arm + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)p.arm + base;
+ iocache.s[1].size = sz;
+
+#ifdef RPI_LUMA_QPU
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)p.arm + base;
+ iocache.s[2].size = sz;
+#endif
+ vcsm_clean_invalid( &iocache );
+#else
+ flush_buffer_u(s->frame);
+ flush_buffer_v(s->frame);
+#ifdef RPI_LUMA_QPU
+ flush_buffer_y(s->frame);
+#endif
+
+#endif
+ //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+ //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+ //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+ }
+}
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+#error XXX
+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+{
+ // Flush image, 4 lines above to bottom of ctb stripe
+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+ // TODO flush buffer of beta/tc setup when it becomes cached
+
+ // Prepare three commands at once to avoid calling overhead
+ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+ s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+ s->dvq->vpu_cmds_arm[0][5] = 2;
+
+ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[1][5] = 3;
+
+ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+ s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[2][5] = 4;
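/*
 * [Editor's note - not part of the patch, inferred from the ARM-side code
 * above only] Each six-word command appears to carry:
 *   [0] VC (bus) address of the plane strip to be deblocked
 *   [1] line stride of that plane
 *   [2] width of the beta/tc setup array for that plane
 *   [3] VC address of the setup rows covering this strip
 *   [4] strip height in 16-pixel units
 *   [5] an operation selector (2 = luma, 3 = U, 4 = V)
 * The VPU firmware that consumes these words is not shown in this patch, so
 * the field meanings are an assumption.
 */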
+ // Call VPU
+ s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+
+ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+ s->dvq = s->dvq_ents + s->dvq_n;
+
+ if (s->dvq->cmd_id != -1) {
+ vpu_wait(s->dvq->cmd_id);
+ s->dvq->cmd_id = -1;
+ }
+}
+
+#endif
+
void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
{
int x_end = x >= s->ps.sps->width - ctb_size;
+#ifdef RPI_DEBLOCK_VPU
+ int done_deblock = 0;
+#endif
if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
deblocking_filter_CTB(s, x, y);
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock && x_end)
+ {
+ int y_at_end = y >= s->ps.sps->height - ctb_size;
+ int height = 64; // Deblock in units 64 high to avoid too many VPU calls
+ int y_start = y&~63;
+ if (y_at_end) height = s->ps.sps->height - y_start;
+ if ((((y+ctb_size)&63)==0) || y_at_end) {
+ done_deblock = 1;
+ rpi_deblock(s, y_start, height);
+ }
+ }
+#endif
if (s->ps.sps->sao_enabled) {
int y_end = y >= s->ps.sps->height - ctb_size;
if (y && x)
@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
sao_filter_CTB(s, x - ctb_size, y);
if (y && x_end) {
sao_filter_CTB(s, x, y - ctb_size);
- if (s->threads_type & FF_THREAD_FRAME )
+ if (s->threads_type & FF_THREAD_FRAME ) {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s,&s->ref->tf, y);
+#endif
ff_thread_report_progress(&s->ref->tf, y, 0);
+ }
}
if (x_end && y_end) {
sao_filter_CTB(s, x , y);
- if (s->threads_type & FF_THREAD_FRAME )
+ if (s->threads_type & FF_THREAD_FRAME ) {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+#endif
ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+ }
+ }
+ } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+ //int newh = y + ctb_size - 4;
+ //int currh = s->ref->tf.progress->data[0];
+ //if (((y + ctb_size)&63)==0)
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+ if (done_deblock) {
+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+ }
+ } else {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
}
- } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+#else
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+#endif
ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+#endif
+ }
}
void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 83f2ec2..6882a8d 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
sps->amp_enabled_flag = get_bits1(gb);
sps->sao_enabled = get_bits1(gb);
+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled);
+
sps->pcm_enabled_flag = get_bits1(gb);
if (sps->pcm_enabled_flag) {
sps->pcm.bit_depth = get_bits(gb, 4) + 1;
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..a6534a9 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
#include "hevcdsp_template.c"
#undef BIT_DEPTH
+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs)
+{
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+
+#if 1 // This more directly matches the original implementation
+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+ // same L0 and L1
+ if (curr_refL0 == neigh_refL0 &&
+ curr_refL0 == curr_refL1 &&
+ neigh_refL0 == neigh_refL1) {
+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL0 == curr_refL0 &&
+ neigh_refL1 == curr_refL1) {
+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL1 == curr_refL0 &&
+ neigh_refL0 == curr_refL1) {
+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else {
+ strength = 1;
+ }
+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+ Mv curr_mv0, neigh_mv0;
+
+ if (curr->pred_flag & 1) {
+ curr_mv0 = curr->mv[0];
+ } else {
+ curr_mv0 = curr->mv[1];
+ curr_refL0 = curr_refL1;
+ }
+
+ if (neigh->pred_flag & 1) {
+ neigh_mv0 = neigh->mv[0];
+ } else {
+ neigh_mv0 = neigh->mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ if (curr_refL0 == neigh_refL0) {
+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else
+ strength = 1;
+ } else
+ strength = 1;
+#else // This has exactly the same effect, but is more suitable for vectorisation
+ Mv curr_mv[2];
+ Mv neigh_mv[2];
+ memcpy(curr_mv, curr->mv, sizeof curr_mv);
+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+
+ if (!(curr->pred_flag & 2)) {
+ curr_mv[1] = curr_mv[0];
+ curr_refL1 = curr_refL0;
+ }
+ if (!(neigh->pred_flag & 2)) {
+ neigh_mv[1] = neigh_mv[0];
+ neigh_refL1 = neigh_refL0;
+ }
+ if (!(curr->pred_flag & 1)) {
+ curr_mv[0] = curr_mv[1];
+ curr_refL0 = curr_refL1;
+ }
+ if (!(neigh->pred_flag & 1)) {
+ neigh_mv[0] = neigh_mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ strength = 1;
+
+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+
+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+
+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+#endif
+
+ curr += in_inc / sizeof (MvField);
+ neigh += in_inc / sizeof (MvField);
+
+ for (out = dup; out > 0; out--)
+ {
+ *bs = strength;
+ bs += out_inc;
+ }
+ }
+}
+
void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
{
#undef FUNC
@@ -257,6 +371,8 @@ int i = 0;
break;
}
+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+
if (ARCH_X86)
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
if (ARCH_ARM)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..e221e54 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -42,6 +42,17 @@ typedef struct SAOParams {
uint8_t type_idx[3]; ///< sao_type_idx
} SAOParams;
+typedef struct Mv {
+ int16_t x; ///< horizontal component of motion vector
+ int16_t y; ///< vertical component of motion vector
+} Mv;
+
+typedef struct MvField {
+ DECLARE_ALIGNED(4, Mv, mv)[2];
+ int8_t ref_idx[2];
+ int8_t pred_flag;
+} MvField;
+
typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
struct GetBitContext *gb, int pcm_bit_depth);
@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
} HEVCDSPContext;
void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 6ae87cc..28d2653 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -20,6 +20,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_INTRA
+
#include "libavutil/pixdesc.h"
#include "bit_depth_template.c"
@@ -69,8 +71,11 @@ do { \
AV_WN4P(&ptr[i], a); \
else \
a = PIXEL_SPLAT_X4(ptr[i + 3])
-
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+#else
HEVCLocalContext *lc = s->HEVClc;
+#endif
int i;
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
@@ -114,6 +119,10 @@ do { \
int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
(x0 + size_in_luma_h)) >> hshift;
+#ifdef DISABLE_INTRA
+ return;
+#endif
+
if (s->ps.pps->constrained_intra_pred_flag == 1) {
int size_in_luma_pu_v = PU(size_in_luma_v);
int size_in_luma_pu_h = PU(size_in_luma_h);
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index 099a8c5..bdff2d2 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -24,6 +24,9 @@
* MMAL Video Decoder
*/
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
#include <bcm_host.h>
#include <interface/mmal/mmal.h>
#include <interface/mmal/mmal_parameters_video.h>
@@ -31,6 +34,7 @@
#include <interface/mmal/util/mmal_util_params.h>
#include <interface/mmal/util/mmal_default_components.h>
#include <interface/mmal/vc/mmal_vc_api.h>
+#pragma GCC diagnostic pop
#include "avcodec.h"
#include "internal.h"
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index 3adf28d..2f9195f 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
if (ctx->divx_version >= 0)
s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+
+ if (ctx->num_sprite_warping_points > 1)
+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
}
if (s->workaround_bugs & FF_BUG_STD_QPEL) {
@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
+ avctx->workaround_bugs = s->workaround_bugs;
if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
s->codec_id == AV_CODEC_ID_MPEG4 &&
avctx->idct_algo == FF_IDCT_AUTO) {
diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
new file mode 100644
index 0000000..4309f1c
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.h
@@ -0,0 +1,3070 @@
+unsigned char rpi_hevc_transform [] = {
+21,
+106,
+0,
+144,
+47,
+1,
+37,
+106,
+0,
+144,
+66,
+1,
+53,
+106,
+0,
+144,
+192,
+4,
+69,
+106,
+0,
+144,
+192,
+4,
+85,
+106,
+0,
+144,
+220,
+5,
+169,
+3,
+62,
+64,
+79,
+64,
+3,
+232,
+32,
+0,
+0,
+0,
+12,
+248,
+0,
+136,
+0,
+0,
+192,
+248,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+12,
+248,
+0,
+168,
+0,
+0,
+192,
+248,
+0,
+0,
+0,
+96,
+3,
+232,
+32,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+8,
+232,
+0,
+4,
+0,
+0,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+4,
+232,
+64,
+0,
+0,
+0,
+5,
+232,
+0,
+8,
+0,
+0,
+128,
+69,
+113,
+66,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+128,
+69,
+113,
+70,
+128,
+144,
+40,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+16,
+0,
+76,
+254,
+48,
+192,
+9,
+4,
+32,
+8,
+0,
+0,
+4,
+254,
+0,
+144,
+128,
+2,
+0,
+8,
+2,
+0,
+128,
+144,
+23,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+20,
+0,
+76,
+254,
+48,
+192,
+4,
+4,
+32,
+8,
+0,
+0,
+140,
+248,
+44,
+0,
+0,
+0,
+32,
+48,
+4,
+0,
+128,
+69,
+113,
+66,
+242,
+140,
+211,
+192,
+34,
+31,
+41,
+3,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+96,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+224,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+225,
+64,
+242,
+64,
+3,
+232,
+128,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+57,
+239,
+224,
+247,
+255,
+255,
+72,
+192,
+95,
+207,
+88,
+122,
+88,
+124,
+137,
+64,
+26,
+64,
+4,
+232,
+64,
+0,
+0,
+0,
+149,
+96,
+161,
+64,
+152,
+64,
+128,
+144,
+35,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+27,
+0,
+4,
+232,
+0,
+8,
+0,
+0,
+69,
+96,
+145,
+64,
+168,
+64,
+128,
+144,
+19,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+11,
+0,
+74,
+232,
+0,
+8,
+0,
+0,
+242,
+140,
+221,
+192,
+57,
+239,
+32,
+8,
+0,
+0,
+41,
+3,
+239,
+3,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+248,
+4,
+0,
+12,
+248,
+0,
+132,
+64,
+0,
+192,
+248,
+4,
+0,
+0,
+96,
+255,
+159,
+154,
+255,
+0,
+232,
+0,
+4,
+0,
+0,
+255,
+159,
+165,
+255,
+4,
+255,
+48,
+204,
+16,
+3,
+224,
+251,
+62,
+0,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+128,
+64,
+6,
+232,
+64,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+32,
+247,
+240,
+207,
+16,
+3,
+32,
+247,
+176,
+207,
+17,
+19,
+32,
+247,
+112,
+207,
+18,
+35,
+32,
+247,
+48,
+207,
+19,
+51,
+32,
+247,
+240,
+206,
+20,
+67,
+32,
+247,
+176,
+206,
+21,
+83,
+32,
+247,
+112,
+206,
+22,
+99,
+32,
+247,
+48,
+206,
+23,
+115,
+32,
+247,
+240,
+205,
+24,
+131,
+32,
+247,
+176,
+205,
+25,
+147,
+32,
+247,
+112,
+205,
+26,
+163,
+32,
+247,
+48,
+205,
+27,
+179,
+32,
+247,
+240,
+204,
+28,
+195,
+32,
+247,
+176,
+204,
+29,
+211,
+32,
+247,
+112,
+204,
+30,
+227,
+32,
+247,
+48,
+204,
+31,
+243,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+0,
+237,
+32,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+111,
+3,
+4,
+254,
+0,
+128,
+0,
+4,
+0,
+248,
+0,
+0,
+2,
+232,
+32,
+0,
+0,
+0,
+140,
+248,
+32,
+0,
+0,
+0,
+224,
+35,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+193,
+232,
+0,
+1,
+0,
+0,
+1,
+106,
+116,
+30,
+90,
+0,
+169,
+3,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+137,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+129,
+0,
+131,
+102,
+0,
+158,
+67,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+108,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+100,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+161,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+150,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+182,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+112,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+101,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+103,
+255,
+239,
+3,
+0,
+254,
+0,
+143,
+92,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+93,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+210,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+211,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+107,
+0,
+8,
+255,
+99,
+23,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+23,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+52,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+52,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+0,
+143,
+12,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+13,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+18,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+19,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+33,
+0,
+8,
+255,
+99,
+3,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+3,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+4,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+4,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+137,
+47,
+240,
+40,
+246,
+2,
+140,
+47,
+240,
+128,
+245,
+99,
+140,
+5,
+4,
+0,
+247,
+99,
+140,
+1,
+20,
+88,
+246,
+99,
+140,
+1,
+20,
+0,
+247,
+35,
+136,
+62,
+226,
+32,
+247,
+35,
+136,
+32,
+210,
+0,
+247,
+34,
+136,
+63,
+2,
+208,
+246,
+34,
+136,
+0,
+4,
+0,
+247,
+99,
+136,
+58,
+162,
+32,
+247,
+99,
+136,
+33,
+146,
+0,
+247,
+98,
+136,
+59,
+18,
+208,
+246,
+98,
+136,
+0,
+20,
+0,
+247,
+162,
+136,
+33,
+2,
+88,
+246,
+98,
+137,
+2,
+68,
+88,
+246,
+162,
+137,
+3,
+68,
+208,
+254,
+227,
+136,
+60,
+242,
+192,
+243,
+188,
+11,
+208,
+254,
+227,
+136,
+56,
+178,
+192,
+243,
+188,
+10,
+32,
+255,
+226,
+136,
+38,
+58,
+192,
+243,
+60,
+0,
+208,
+254,
+227,
+136,
+59,
+242,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+49,
+58,
+192,
+243,
+60,
+128,
+0,
+255,
+226,
+136,
+34,
+34,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+37,
+58,
+192,
+243,
+60,
+128,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+194,
+8,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+255,
+202,
+40,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+254,
+0,
+240,
+35,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+226,
+140,
+34,
+34,
+195,
+243,
+60,
+0,
+32,
+255,
+227,
+140,
+36,
+58,
+192,
+243,
+60,
+0,
+0,
+254,
+192,
+136,
+0,
+4,
+0,
+240,
+0,
+160,
+16,
+246,
+226,
+136,
+35,
+50,
+16,
+246,
+226,
+136,
+35,
+50,
+32,
+246,
+226,
+136,
+35,
+50,
+32,
+254,
+226,
+136,
+35,
+58,
+192,
+243,
+60,
+0,
+11,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+115,
+5,
+106,
+0,
+144,
+173,
+1,
+27,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+227,
+0,
+64,
+246,
+163,
+140,
+1,
+4,
+0,
+246,
+192,
+175,
+63,
+2,
+0,
+246,
+192,
+174,
+59,
+2,
+0,
+246,
+128,
+175,
+62,
+2,
+0,
+246,
+128,
+174,
+58,
+2,
+0,
+246,
+64,
+175,
+61,
+2,
+0,
+246,
+64,
+174,
+57,
+2,
+0,
+255,
+43,
+240,
+4,
+212,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+228,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+191,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+143,
+52,
+242,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+212,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+180,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+190,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+143,
+52,
+226,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+180,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+212,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+196,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+189,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+143,
+52,
+210,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+148,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+164,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+228,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+187,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+142,
+52,
+178,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+148,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+244,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+186,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+142,
+52,
+162,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+244,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+148,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+132,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+185,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+142,
+52,
+146,
+192,
+243,
+60,
+128,
+64,
+255,
+98,
+141,
+0,
+52,
+192,
+243,
+0,
+0,
+0,
+254,
+0,
+240,
+53,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+177,
+0,
+88,
+246,
+163,
+140,
+1,
+4,
+128,
+245,
+99,
+141,
+10,
+4,
+88,
+246,
+162,
+138,
+1,
+68,
+0,
+247,
+162,
+138,
+36,
+162,
+88,
+254,
+162,
+138,
+3,
+164,
+192,
+243,
+128,
+11,
+0,
+255,
+226,
+137,
+32,
+2,
+195,
+243,
+60,
+0,
+32,
+247,
+226,
+137,
+42,
+114,
+0,
+255,
+34,
+138,
+33,
+18,
+195,
+243,
+60,
+0,
+32,
+247,
+34,
+138,
+42,
+130,
+16,
+246,
+98,
+138,
+40,
+114,
+16,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+226,
+137,
+41,
+146,
+40,
+246,
+34,
+138,
+41,
+146,
+32,
+247,
+163,
+141,
+63,
+178,
+32,
+247,
+227,
+141,
+62,
+162,
+0,
+254,
+0,
+240,
+8,
+4,
+0,
+240,
+128,
+11,
+128,
+253,
+35,
+240,
+9,
+100,
+192,
+243,
+128,
+10,
+128,
+253,
+163,
+141,
+128,
+115,
+192,
+243,
+152,
+10,
+88,
+246,
+163,
+141,
+4,
+100,
+208,
+246,
+35,
+139,
+0,
+100,
+32,
+255,
+34,
+139,
+53,
+202,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+139,
+0,
+4,
+0,
+240,
+0,
+160,
+240,
+246,
+163,
+141,
+48,
+98,
+0,
+247,
+99,
+139,
+63,
+210,
+0,
+247,
+98,
+139,
+1,
+212,
+88,
+254,
+98,
+139,
+1,
+212,
+192,
+243,
+128,
+11,
+32,
+255,
+99,
+139,
+62,
+98,
+192,
+243,
+188,
+10,
+88,
+246,
+98,
+139,
+1,
+212,
+240,
+246,
+98,
+139,
+50,
+210,
+0,
+247,
+163,
+128,
+59,
+146,
+0,
+247,
+160,
+128,
+1,
+36,
+88,
+254,
+160,
+128,
+1,
+36,
+192,
+243,
+128,
+11,
+0,
+247,
+163,
+128,
+58,
+98,
+64,
+255,
+35,
+240,
+0,
+100,
+192,
+243,
+128,
+10,
+64,
+255,
+163,
+128,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+160,
+128,
+1,
+36,
+240,
+246,
+160,
+128,
+50,
+34,
+8,
+255,
+227,
+143,
+54,
+242,
+192,
+243,
+60,
+128,
+40,
+255,
+227,
+142,
+54,
+178,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+39,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+143,
+45,
+226,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+44,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+40,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+142,
+2,
+162,
+192,
+243,
+60,
+128,
+90,
+0,
+169,
+3,
+14,
+96,
+4,
+31,
+169,
+3,
+30,
+96,
+1,
+31,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+143,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+135,
+0,
+131,
+102,
+0,
+158,
+71,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+112,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+104,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+123,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+112,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+178,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+72,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+61,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+95,
+255,
+239,
+3,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+47,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+13,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+140,
+47,
+240,
+32,
+247,
+35,
+141,
+63,
+178,
+64,
+254,
+35,
+141,
+2,
+68,
+192,
+243,
+128,
+11,
+32,
+255,
+35,
+240,
+58,
+226,
+192,
+243,
+188,
+10,
+0,
+254,
+0,
+141,
+4,
+4,
+0,
+240,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+240,
+246,
+35,
+141,
+48,
+66,
+0,
+247,
+227,
+143,
+52,
+242,
+32,
+247,
+227,
+142,
+52,
+178,
+90,
+0,
+161,
+3,
+6,
+64,
+23,
+64,
+96,
+8,
+70,
+98,
+97,
+8,
+70,
+98,
+98,
+8,
+70,
+98,
+99,
+8,
+70,
+98,
+100,
+8,
+70,
+98,
+101,
+8,
+70,
+98,
+255,
+159,
+8,
+250,
+23,
+102,
+7,
+106,
+112,
+30,
+33,
+3,
+};
diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
new file mode 100644
index 0000000..5543093
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.s
@@ -0,0 +1,917 @@
+# ******************************************************************************
+# Argon Design Ltd.
+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+#
+# Module : HEVC
+# Author : Peter de Rivaz
+# ******************************************************************************
+
+# HEVC VPU Transform
+#
+# Transform matrix can be thought of as
+# output row vector = input row vector * transMatrix2
+#
+# The even rows of the matrix are symmetric
+# The odd rows of the matrix are antisymmetric
+#
+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+#
+# EXAMPLE
+# (a b c d) (1 2 2 1)
+# (3 4 -4 -3)
+# (5 6 6 5)
+# (7 8 -8 -7)
+#
+# x=(a c)(1 2) = 1a+5c 2a+6c
+# (5 6)
+#
+# y=(b d)(3 4) = 3b+7d 4b+8d
+# (7 8)
+#
+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+#
+# Final results are (u , v[::-1])
+#
+#
+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+# Apply the even matrix first and stop before rounding
+# Then apply the odd matrix in a full manner:
+#
+# First step is to compute partial products with the first input (16 cycles)
+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
+# 2a 4b 6c 8d
+# 2a -4b 6c -8d
+# 1a -3b 5c -7d
+#
+# Second step is to sum partial products into final position (8 cycles)
+# 1a+3b+5c+7d
+# 2a+4b+6c+8d
+# 2a-4b+6c-8d
+# 1a-3b+5c-7d
+#
+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+#
+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+#
+# For 8x8 we could compute two in parallel.
+#
+#
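+# Illustrative reference only (not part of the VPU code): the even/odd split in
+# the EXAMPLE above, written out as C. The function name and matrix indexing
+# are assumptions made for the illustration.
+#
+#   void trans4(const int in[4], const int m[4][4], int out[4])
+#   {
+#       int x0 = in[0]*m[0][0] + in[2]*m[2][0];   /* even half, uses (a c) */
+#       int x1 = in[0]*m[0][1] + in[2]*m[2][1];
+#       int y0 = in[1]*m[1][0] + in[3]*m[3][0];   /* odd half, uses (b d) */
+#       int y1 = in[1]*m[1][1] + in[3]*m[3][1];
+#       out[0] = x0 + y0; out[1] = x1 + y1;       /* u = x + y */
+#       out[3] = x0 - y0; out[2] = x1 - y1;       /* v = x - y, stored reversed */
+#   }
+#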
+
+# Columns are transformed first
+#
+# Store top left half of transMatrix2 in HX(32,0)
+# Store bottom left half of transMatrix2 in HX(32,32)
+#
+# For 16x16
+# HX(0:15,0) contains input data before transform
+# HY(0:15,0) contains 32bit output data after transform
+# HX(32,0) contains even rows of left half of transMatrix2
+# HX(32,32) contains odd rows of left half of transMatrix2
+# HY(48,0) contains partial products ready for summing
+#
+
+
+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+# coeffs32
+# num32: number of 32x32 transforms
+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
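+#          (the dispatch on r5 just below also accepts: 2 for hevc_deblock_16x16,
+#           3 for hevc_uv_deblock_16x16, 4 for hevc_uv_deblock_16x16_with_clear,
+#           5 for hevc_run_command_list)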
+#
+hevc_trans_16x16:
+ cmp r5,1
+ beq memclear16
+ cmp r5,2
+ beq hevc_deblock_16x16
+ cmp r5,3
+ beq hevc_uv_deblock_16x16
+ cmp r5,4
+ beq hevc_uv_deblock_16x16_with_clear
+ cmp r5,5
+ beq hevc_run_command_list
+
+ push r6-r15, lr # TODO cut down number of used registers
+ mov r14,r3 # coeffs32
+ mov r15,r4 # num32
+ mov r3, 16*2 # Stride of transMatrix2 in bytes
+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+
+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ # Now use r0 to describe which matrix we are working on.
+ # Allows us to prefetch the next block of coefficients for efficiency.
+ mov r0,0 # This describes the location where we read our coefficients from
+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+ mov r7,16*16*2 # Total block size
+ mov r8,64*16 # Value used to swap from current to next VRF location
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ mov r4,64 # Constant used for rounding first pass
+ mov r5,1<<11 # Constant used for rounding second pass
+
+ # At start of block r0,r1 point to the current block (that has already been loaded)
+block_loop:
+ eor r0,r8
+ add r1,r7
+ # Prefetch the next block
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ eor r0,r8
+ sub r1,r7
+
+ # Transform the current block
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
+
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
+
+ # Save results - note there has been a transposition during the processing so we save columns
+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
+
+ # Move onto next block
+ eor r0,r8
+ add r1,r7
+
+ addcmpbgt r2,-1,0,block_loop
+
+ # Now go and do any 32x32 transforms
+ b hevc_trans_32x32
+
+ pop r6-r15, pc
+
+# r1,r2,r3 r7,r8 should be preserved
+# HX(0++,0)+r0 is the block to be transformed
+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+# Use HY(48,0) for intermediate results
+# r0 can be used, but should be returned to its original value at the end
+col_trans_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+col_trans_odd_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_odd_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_odd_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 32x32 transforms to be done
+#
+hevc_trans_32x32:
+ mov r1,r14 # coeffs
+ mov r2,r15 # num
+
+ # Fetch odd transform matrix
+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+ #add r0, 16*16*2
+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+ mov r7, 16*16*2 # Total block size
+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+ # set r8 to 32byte aligned stack pointer
+ add r8,sp,31
+ lsr r8,5
+ lsl r8,5
+ mov r9,r8 # Backup of the temporary storage
+ mov r10,r1 # Backup of the coefficient buffer
+block_loop32:
+
+ # COLUMN TRANSFORM
+ mov r4, 64 # Constant used for rounding first pass
+ mov r5, 9 # left shift used for rounding first pass
+
+ # Transform the first 16 columns
+ mov r1,r10 # Input Coefficient buffer
+ mov r8,r9 # Output temporary storage
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ # ROW TRANSFORM
+ mov r4, 1<<11 # Constant used for rounding second pass
+ mov r5, 4 # left shift used for rounding second pass
+
+ mov r1,r9 # Input temporary storage
+ mov r8,r10 # Output Coefficient buffer
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ add r10, 32*32*2 # move onto next block of coefficients
+ addcmpbgt r2,-1,0,block_loop32
+
+ add sp,sp,32*32*2+32 # Restore stack
+
+ pop r6-r15, pc
+
+trans32:
+ push lr
+ # We can no longer afford the VRF space to do prefetching when doing 32x32
+ # Fetch the even rows
+ vldh HX(0++,0),(r1 += r3) REP 16
+ # Fetch the odd rows
+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+
+ # Transform the even rows using even matrix
+ mov r0, 0 # Even rows
+ bl col_trans_16
+
+ # Now transform the odd rows using odd matrix
+ mov r0, 64*16 # Odd rows
+ bl col_trans_odd_16
+
+ # Now apply butterfly to compute the first 16 results
+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ # 16bit results now in HX(48,32)
+ mov r0,r8
+ mov r6,32*2
+ vsth VX(48,32++),(r0+=r6) REP 16
+
+ # Now apply butterfly to compute the second 16 results (in reverse order)
+ vsub HY(63,0),HY(0 ,0),HY(16,0)
+ vsub HY(62,0),HY(1 ,0),HY(17,0)
+ vsub HY(61,0),HY(2 ,0),HY(18,0)
+ vsub HY(60,0),HY(3 ,0),HY(19,0)
+ vsub HY(59,0),HY(4 ,0),HY(20,0)
+ vsub HY(58,0),HY(5 ,0),HY(21,0)
+ vsub HY(57,0),HY(6 ,0),HY(22,0)
+ vsub HY(56,0),HY(7 ,0),HY(23,0)
+ vsub HY(55,0),HY(8 ,0),HY(24,0)
+ vsub HY(54,0),HY(9 ,0),HY(25,0)
+ vsub HY(53,0),HY(10,0),HY(26,0)
+ vsub HY(52,0),HY(11,0),HY(27,0)
+ vsub HY(51,0),HY(12,0),HY(28,0)
+ vsub HY(50,0),HY(13,0),HY(29,0)
+ vsub HY(49,0),HY(14,0),HY(30,0)
+ vsub HY(48,0),HY(15,0),HY(31,0)
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ add r0,r8,32
+ vsth VX(48,32++),(r0+=r6) REP 16
+ pop pc
+
+memclear16:
+ # r0 is address
+ # r1 is number of 16-bit values to set to 0 (may overrun past the end and clear more than specified)
+ vmov HX(0++,0),0 REP 16
+ mov r2,32
+loop:
+ vsth HX(0++,0),(r0+=r2) REP 16
+ add r0,16*16*2
+ sub r1,16*16
+ cmp r1,0
+ bgt loop
+ b lr
+
+
+################################################################################
+# HEVC VPU Deblock
+#
+# Vertical edges before horizontal
+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+#
+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+# The VPU code works in units of 16x16 blocks.
+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+# One final horizontal filter is required at the end.
+# PCM is not allowed in this code.
+#
+#
+# H(16-4:16+15,0) contains the previous block (note that we need 4 lines of context above it that may get altered during filtering)
+# H(16:31,16) contains the current block (note that we do not need the upper lines until the horizontal filtering)
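+#
+# i.e. for blocks 0..N-1 across a row the schedule is:
+#   vert(0), then vert(1)+horz(0), vert(2)+horz(1), ..., vert(N-1)+horz(N-2), and finally horz(N-1)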
+
+.set P0,63
+.set P1,62
+.set P2,61
+.set P3,60
+.set Q0,59
+.set Q1,58
+.set Q2,57
+.set Q3,56
+
+.set dp,32
+.set dq,33
+.set d,34
+.set decision,35
+.set beta,36
+.set beta2,37
+.set beta3,38
+.set ptest,39
+.set qtest,40
+.set pqtest,41
+.set thresh,42
+.set deltatest, 44
+.set deltap1, 45
+.set tc25, 46
+.set setup,47
+.set tc,48
+.set tc25,49
+.set tc2, 50
+.set do_filter, 51
+.set delta, 52
+.set tc10, 53
+.set delta0, 54
+.set delta1, 55
+.set zeros, 0
+.set setup_input, 1
+.set deltaq1, 2
+
+
+
+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+# Row has num16 16x16 blocks across
+# Beta goes from 0 to 64
+# tc goes from 0 to 24
+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+# has 8 bytes per edge
+# has 16 bytes per direction
+# has 32 bytes per 16x16 block
+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
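+# (For reference, an illustrative C view of the same layout, not used by this code:
+#   uint8_t setup[num16][2 /*0=vert,1=horz*/][2 /*0=first,1=second edge*/][2 /*0=beta,1=tc*/][4 /*edge no*/];
+#  so setup[b][d][e][0][i] holds beta and setup[b][d][e][1][i] holds tc for the
+#  i'th 4-pixel run of edge e in direction d of 16x16 block b.)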
+hevc_deblock_16x16:
+ push r6-r15, lr
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+
+process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl vert_filter
+ sub r3,8
+ b start_deblock_loop
+deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt start_again
+ pop r6-r15, pc
+start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+vert_filter:
+ push lr
+
+ vmov HX(P3,0), V(16,12)+r3
+ vmov HX(P2,0), V(16,13)+r3
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+ vmov HX(Q2,0), V(16,18)+r3
+ vmov HX(Q3,0), V(16,19)+r3
+
+ bl do_luma_filter
+
+ vadds V(16,13)+r3, HX(P2,0), 0
+ vadds V(16,14)+r3, HX(P1,0), 0
+ vadds V(16,15)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds V(16,16)+r3, HX(Q0,0), 0
+ vadds V(16,17)+r3, HX(Q1,0), 0
+ vadds V(16,18)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+horz_filter:
+ push lr
+
+ vmov HX(P3,0), H(12,0)+r3
+ vmov HX(P2,0), H(13,0)+r3
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+ vmov HX(Q2,0), H(18,0)+r3
+ vmov HX(Q3,0), H(19,0)+r3
+
+ bl do_luma_filter
+
+ vadds H(13,0)+r3, HX(P2,0), 0
+ vadds H(14,0)+r3, HX(P1,0), 0
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+ vadds H(17,0)+r3, HX(Q1,0), 0
+ vadds H(18,0)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4 length edge
+do_luma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+ valtl HX(beta,0),H(setup,0),H(setup,0)
+ valtu HX(tc,0),H(setup,0),H(setup,0)
+ vmul HX(tc25,0), HX(tc,0), 5
+ vadd HX(tc25,0),HX(tc25,0), 1
+ vasr HX(tc25,0), HX(tc25,0), 1
+
+ # Compute decision
+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+
+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+
+ vadd HX(d,0), HX(dp,0), HX(dq,0)
+ vasr HX(beta2,0),HX(beta,0),2
+ vasr HX(beta3,0),HX(beta,0),3
+
+ # Compute flags that are negative if all conditions pass
+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+
+ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+ vmov HX(decision,0), 1 IFNN
+ vadd H(decision,0),H(decision,3),0 IFN
+ vadd H(decision,16),H(decision,19),0 IFN
+ vmov -,HX(decision,0) SETF # N marks strong filter
+ vmov HX(decision,0), 1 IFNN # NN marks normal filter
+
+ vadd HX(do_filter,0), HX(d,3), HX(d,0)
+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+ vmov HX(decision,0),0 IFNN # Z marks no filter
+
+ # Expand out the decision (currently only one valid value every 4 pixels) 0...1...2...3
+ # First extract out even terms
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
+ # Now expand back
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+
+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+
+ # Do a quick check to see if there is anything to do
+ mov r11, 0 # Signal no filtering
+ vmov -,1 IFNZ SUMS r5
+ cmp r5,0
+ beq filtering_done
+ mov r11, 1 # Signal some filtering
+ # And whether there is any strong filtering
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq normal_filtering
+
+ ##############################################################################
+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+
+ # Take a copy of the original pixels for use in decision calculation
+ vmov HX(P0,32),HX(P0,0)
+ vmov HX(Q0,32),HX(Q0,0)
+ vmov HX(P1,32),HX(P1,0)
+ vmov HX(Q1,32),HX(Q1,0)
+ vmov HX(P2,32),HX(P2,0)
+ vmov HX(Q2,32),HX(Q2,0)
+
+ vadd -,HX(P2,32),4 CLRA SACC
+ vshl -,HX(P1,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl HX(delta,0),HX(Q1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+
+ vadd -,HX(P2,32),2 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vshl HX(delta,0),HX(Q0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(P1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+
+ vadd -,HX(Q0,32),4 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vmul -,HX(P2,32),3 SACC
+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+ #vmov HX(P2,0),3 IFN
+
+ # Now reverse all P/Qs
+
+ vadd -,HX(Q2,32),4 CLRA SACC
+ vshl -,HX(Q1,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl HX(delta,0),HX(P1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+
+ vadd -,HX(Q2,32),2 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vshl HX(delta,0),HX(P0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+
+ vadd -,HX(P0,32),4 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vmul -,HX(Q2,32),3 SACC
+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+
+ ##############################################################################
+ # Normal filtering
+normal_filtering:
+ # Invert the decision flags
+ # make the instruction more complicated as the assembler has an error and loses SETF
+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+ vmov -, HX(tc10,0) SETF # IFN means normal filtering
+
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq filtering_done
+
+ vasr HX(tc2,0), HX(tc,0), 1
+ vmul HX(tc10,0), HX(tc,0), 10
+
+ vasr HX(thresh,0), HX(beta,0), 1
+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+
+ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+ # Expand ptest and qtest together
+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+
+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+ vmov -,8 CLRA SACC
+ vmul -,HX(delta0,0), 9 SACC
+ vmul HX(delta0,0),HX(delta1,0), r6 SACC
+ vasr HX(delta0,0), HX(delta0,0), 4
+ vdist HX(deltatest,0), HX(delta0,0), 0
+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+
+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+
+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+ vadd HX(deltap1,0), HX(deltap1,0), 1
+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+ vasr HX(deltap1,0), HX(deltap1,0), 1
+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+
+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+ vadd HX(deltaq1,0), HX(deltaq1,0), 1
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+ vrsub -, HX(delta0,0), 0 SACC
+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1
+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+
+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+
+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+
+ vmov -,HX(deltatest,0) SETF
+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+
+ #vmov HX(P2,0),1 IFN
+
+filtering_done:
+ b lr
+
+
+hevc_uv_deblock_16x16:
+ push r6-r15, lr
+ mov r14,0
+ b hevc_uv_start
+hevc_uv_deblock_16x16_with_clear:
+ push r6-r15, lr
+ mov r14,1
+ b hevc_uv_start
+
+hevc_uv_start:
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+# r14 is 1 if we should clear the old contents, or 0 if not
+
+uv_process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ cmp r14,1
+ bne uv_skip0
+ vstb H(zeros,0),(r4)
+uv_skip0:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl uv_vert_filter
+ sub r3,8
+ b uv_start_deblock_loop
+uv_deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ cmp r14,1
+ bne uv_skip1
+ vstb H(zeros,0),(r4)
+uv_skip1:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip3
+ vstb H(zeros,0),-16(r4)
+uv_skip3:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,uv_skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+uv_start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt uv_deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip2
+ vstb H(zeros,0),-16(r4)
+uv_skip2:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,uv_skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt uv_start_again
+ pop r6-r15, pc
+uv_start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b uv_process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+uv_vert_filter:
+ push lr
+
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+
+ bl do_chroma_filter
+
+ vadds V(16,15)+r3, HX(P0,0), 0
+ vadds V(16,16)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+uv_horz_filter:
+ push lr
+
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+
+ bl do_chroma_filter
+
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4 length edge
+do_chroma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+ valtl HX(tc,0),H(setup,0),H(setup,0)
+
+ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+ vsub -,HX(P1,0),HX(Q1,0) SACC
+ vmov HX(delta,0),4 SACC
+ vasr HX(delta,0),HX(delta,0),3
+ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+ vadd HX(P0,0),HX(P0,0),HX(delta,0)
+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+ b lr
+
+# r0 = list
+# r1 = number
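+# Each command is six consecutive 32-bit words giving r0..r5 for one call to
+# hevc_trans_16x16 (r5 selects the operation - see the dispatch at the top of this file)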
+hevc_run_command_list:
+ push r6-r7, lr
+ mov r6, r0
+ mov r7, r1
+loop_cmds:
+ ld r0,(r6) # How to encode r6++?
+ add r6,4
+ ld r1,(r6)
+ add r6,4
+ ld r2,(r6)
+ add r6,4
+ ld r3,(r6)
+ add r6,4
+ ld r4,(r6)
+ add r6,4
+ ld r5,(r6)
+ add r6,4
+ bl hevc_trans_16x16
+ sub r7,1
+ cmp r7,0
+ bgt loop_cmds
+
+ pop r6-r7, pc
diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
new file mode 100644
index 0000000..3904efc
--- /dev/null
+++ b/libavcodec/rpi_mailbox.c
@@ -0,0 +1,340 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+#include <linux/ioctl.h>
+
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+#include "rpi_mailbox.h"
+
+#define PAGE_SIZE (4*1024)
+
+// Shared memory will not be cached in ARM cache
+void *mapmem_shared(unsigned base, unsigned size)
+{
+ int mem_fd;
+ unsigned offset = base % PAGE_SIZE;
+ base = base - offset;
+ /* open /dev/mem */
+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+ return NULL;
+ }
+ void *mem = mmap(
+ 0,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED/*|MAP_FIXED*/,
+ mem_fd,
+ base);
+#ifdef DEBUG
+ printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+ if (mem == MAP_FAILED) {
+ printf("mmap error %d\n", (int)mem);
+ return NULL;
+ }
+ close(mem_fd);
+ return (char *)mem + offset;
+}
+
+// Unshared memory will be faster as it lives in the ARM cache, but requires cache flushing
+void *mapmem_private(unsigned base, unsigned size)
+{
+ int mem_fd;
+ unsigned offset = base % PAGE_SIZE;
+ base = base - offset;
+ /* open /dev/mem */
+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+ return NULL;
+ }
+ void *mem = mmap(
+ 0,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_PRIVATE/*|MAP_FIXED*/,
+ mem_fd,
+ base);
+#ifdef DEBUG
+ printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+ if (mem == MAP_FAILED) {
+ printf("mmap error %d\n", (int)mem);
+ return NULL;
+ }
+ close(mem_fd);
+ return (char *)mem + offset;
+}
+
+void unmapmem(void *addr, unsigned size)
+{
+ int s = munmap(addr, size);
+ if (s != 0) {
+ printf("munmap error %d\n", s);
+ exit (-1);
+ }
+}
+
+/*
+ * use ioctl to send mbox property message
+ */
+
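+// Property buffer layout, in 32-bit words (as built by the helpers below):
+//   [0] total size in bytes, [1] request code (0), then one or more tags of
+//   (tag id, value buffer size, data size, value words...), ended by a 0 end tag.
+// The firmware writes its response back into the same buffer, which is why all
+// of the helpers below return p[5] - the first value word of their single tag.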
+static int mbox_property(int file_desc, void *buf)
+{
+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+ if (ret_val < 0) {
+ printf("ioctl_set_msg failed:%d\n", ret_val);
+ }
+
+#ifdef DEBUG
+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+ for (i=0; i<size/4; i++)
+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+#endif
+ return ret_val;
+}
+
+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000c; // (the tag id)
+ p[i++] = 12; // (size of the buffer)
+ p[i++] = 12; // (size of the data)
+ p[i++] = size; // (num bytes? or pages?)
+ p[i++] = align; // (alignment)
+ p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_free(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000f; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_lock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000d; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_unlock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000e; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x30010; // (the tag id)
+ p[i++] = 28; // (size of the buffer)
+ p[i++] = 28; // (size of the data)
+ p[i++] = code;
+ p[i++] = r0;
+ p[i++] = r1;
+ p[i++] = r2;
+ p[i++] = r3;
+ p[i++] = r4;
+ p[i++] = r5;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned qpu_enable(int file_desc, unsigned enable)
+{
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x30012; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = enable;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+ p[i++] = 0x30011; // (the tag id)
+ p[i++] = 16; // (size of the buffer)
+ p[i++] = 16; // (size of the data)
+ p[i++] = num_qpus;
+ p[i++] = control;
+ p[i++] = noflush;
+ p[i++] = timeout; // ms
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+void execute_multi(int file_desc,
+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+ p[i++] = 0x30018; // (the tag id)
+ p[i++] = 88; // (size of the buffer)
+ p[i++] = 88; // (size of the data)
+
+ p[i++] = num_qpus;
+ p[i++] = control;
+ p[i++] = noflush;
+ p[i++] = timeout; // ms
+
+ p[i++] = num_qpus_2;
+ p[i++] = control_2;
+ p[i++] = noflush_2;
+ p[i++] = timeout_2; // ms
+
+ p[i++] = code;
+ p[i++] = r0;
+ p[i++] = r1;
+ p[i++] = r2;
+ p[i++] = r3;
+ p[i++] = r4;
+ p[i++] = r5;
+
+ p[i++] = code_2;
+ p[i++] = r0_2;
+ p[i++] = r1_2;
+ p[i++] = r2_2;
+ p[i++] = r3_2;
+ p[i++] = r4_2;
+ p[i++] = r5_2;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return;
+}
+
+int mbox_open() {
+ int file_desc;
+
+ // open a char device file used for communicating with kernel mbox driver
+ file_desc = open(DEVICE_FILE_NAME, 0);
+ if (file_desc < 0) {
+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+ }
+ return file_desc;
+}
+
+void mbox_close(int file_desc) {
+ close(file_desc);
+}
diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
new file mode 100644
index 0000000..5898102
--- /dev/null
+++ b/libavcodec/rpi_mailbox.h
@@ -0,0 +1,25 @@
+#ifndef RPI_MAILBOX_H
+#define RPI_MAILBOX_H
+
+extern int mbox_open(void);
+extern void mbox_close(int file_desc);
+
+extern unsigned get_version(int file_desc);
+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+extern unsigned mem_free(int file_desc, unsigned handle);
+extern unsigned mem_lock(int file_desc, unsigned handle);
+extern unsigned mem_unlock(int file_desc, unsigned handle);
+extern void *mapmem_shared(unsigned base, unsigned size);
+extern void *mapmem_private(unsigned base, unsigned size);
+extern void unmapmem(void *addr, unsigned size);
+
+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+extern void execute_multi(int file_desc,
+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+extern unsigned qpu_enable(int file_desc, unsigned enable);
+
+#endif
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000..a01c051
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,991 @@
+#ifdef RPI
+// Use vchiq service for submitting jobs
+#define GPUSERVICE
+
+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+//#define RPI_TIME_TOTAL_QPU
+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPU code
+//#define RPI_TIME_TOTAL_VPU
+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+#define RPI_TIME_TOTAL_POSTED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/avassert.h"
+
+#include "config.h"
+
+#include <pthread.h>
+#include <time.h>
+
+#include "rpi_mailbox.h"
+#include "rpi_qpu.h"
+#include "rpi_shader.h"
+#include "rpi_hevc_transform.h"
+
+#include "rpi_user_vcsm.h"
+#ifdef GPUSERVICE
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+#pragma GCC diagnostic pop
+#endif
+
+// QPU profile flags
+#define NO_FLUSH 1
+#define CLEAR_PROFILE 2
+#define OUTPUT_COUNTS 4
+
+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+
+
+// On Pi2 there is no way to access the VPU L2 cache
+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache)
+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+#define GPU_MEM_FLG 0x4
+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache)
+#define GPU_MEM_MAP 0x0
+
+#define vcos_verify_ge0(x) ((x)>=0)
+
+/*static const unsigned code[] =
+{
+ #include "rpi_shader.hex"
+};*/
+
+// Size in 32bit words
+#define QPU_CODE_SIZE 2048
+#define VPU_CODE_SIZE 2048
+
+const short rpi_transMatrix2even[32][16] = { // Even rows first
+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
+// Odd rows
+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
+
+struct GPU
+{
+ unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
+ int open_count; // Number of allocated video buffers
+ int mb; // Mailbox handle
+ int vc; // Address in GPU memory
+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+};
+
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+static volatile struct GPU* gpu = NULL;
+static GPU_MEM_PTR_T gpu_mem_ptr;
+
+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+static unsigned int Microseconds(void) {
+ struct timespec ts;
+ unsigned int x;
+ static unsigned int base = 0;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+ if (base==0) base=x;
+ return x-base;
+}
+#endif
+
+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+
+// Connect to QPU, returns 0 on success.
+static int gpu_init(volatile struct GPU **gpu) {
+ int mb = mbox_open();
+ int vc;
+ volatile struct GPU* ptr;
+ if (mb < 0)
+ return -1;
+#ifndef RPI_ASYNC
+ if (qpu_enable(mb, 1)) return -2;
+#endif
+ vcsm_init();
+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+ memset((void*)ptr, 0, sizeof *ptr);
+ vc = gpu_mem_ptr.vc;
+
+ ptr->mb = mb;
+ ptr->vc = vc;
+
+ printf("GPU allocated at 0x%x\n",vc);
+
+ *gpu = ptr;
+
+ // Now copy over the QPU code into GPU memory
+ {
+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+ }
+ // And the VPU code
+ {
+ int num_bytes = sizeof(rpi_hevc_transform);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+ }
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
+#ifdef RPI_ASYNC
+ {
+ int err;
+ vpu_async_tail = 0;
+ vpu_async_head = 0;
+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+ //printf("Created thread\n");
+ if (err) {
+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+ return -4;
+ }
+
+ {
+ struct sched_param param = {0};
+ int policy = 0;
+
+ if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+
+ policy = SCHED_FIFO;
+ param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+
+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+
+ if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+ }
+ }
+ }
+
+ }
+
+ }
+#endif
+
+ return 0;
+}
+
+// Returns 1 if the gpu is currently idle
+static int gpu_idle(void)
+{
+ int ret = pthread_mutex_trylock(&gpu_mutex);
+ if (ret==0) {
+ pthread_mutex_unlock(&gpu_mutex);
+ return 1;
+ }
+ return 0;
+}
+
+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+static void gpu_lock(void) {
+ pthread_mutex_lock(&gpu_mutex);
+
+ if (gpu==NULL) {
+ gpu_init(&gpu);
+ }
+}
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+ return 0;
+}
+
+// Allocate memory on the GPU.
+// Fills in structure <p> with the ARM pointer, videocore handle, videocore memory address and numbytes.
+// Returns 0 on success.
+// The memory allocated here is not cached in the ARM data cache,
+// so it is safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_lock();
+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+ gpu->open_count++;
+ gpu_unlock();
+ return r;
+}
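+
+// Illustrative usage sketch (not a call site added by this patch): the
+// intended lifecycle of an uncached allocation, using only functions and
+// fields declared in rpi_qpu.h. The buffer name and size are hypothetical.
+//
+//   GPU_MEM_PTR_T buf;
+//   if (gpu_malloc_uncached(64 * 1024, &buf) == 0) {
+//       memset(buf.arm, 0, buf.numbytes);  // CPU writes through the ARM mapping
+//       // ... pass buf.vc (the bus address) to VPU/QPU code ...
+//       gpu_free(&buf);
+//   }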
+
+int gpu_get_mailbox(void)
+{
+ av_assert0(gpu);
+ return gpu->mb;
+}
+
+// Call this to clean and invalidate a region of memory
+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ iocache.s[0].handle = p->vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int) p->arm;
+ iocache.s[0].size = p->numbytes;
+ vcsm_clean_invalid( &iocache );
+#else
+ void *tmp = vcsm_lock(p->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+#endif
+}
+
+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ iocache.s[0].handle = p0->vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int) p0->arm;
+ iocache.s[0].size = p0->numbytes;
+ iocache.s[1].handle = p1->vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int) p1->arm;
+ iocache.s[1].size = p1->numbytes;
+ iocache.s[2].handle = p2->vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int) p2->arm;
+ iocache.s[2].size = p2->numbytes;
+ vcsm_clean_invalid( &iocache );
+#else
+ void *tmp;
+ tmp = vcsm_lock(p0->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+ tmp = vcsm_lock(p1->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+ tmp = vcsm_lock(p2->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+#endif
+}
+
+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mem_lock(gpu->mb, p->vc_handle);
+ av_assert0(p->vc);
+ return 0;
+}
+
+// This allocates data that will be cached in the ARM L2 but uncached in the VPU L2.
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_lock();
+ r = gpu_malloc_cached_internal(numbytes, p);
+ gpu->open_count++;
+ gpu_unlock();
+ return r;
+}
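+
+// Illustrative sketch of the cached-allocation flow, assuming only the
+// functions in this file: because this memory is cached on the ARM side it
+// must be cleaned/invalidated before the VPU/QPU consumes it (or after the
+// GPU has written it and before the CPU reads it back). The size used here
+// is hypothetical.
+//
+//   GPU_MEM_PTR_T cbuf;
+//   gpu_malloc_cached(1024, &cbuf);
+//   // ... CPU fills cbuf.arm ...
+//   gpu_cache_flush(&cbuf);   // clean+invalidate so the GPU sees the data
+//   // ... submit the VPU/QPU job that reads cbuf.vc ...
+//   gpu_free(&cbuf);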
+
+static void gpu_term(void)
+{
+ int mb;
+
+ if (gpu==NULL)
+ return;
+ mb = gpu->mb;
+
+ // ??? Tear down anything needed for gpuexecute
+
+ qpu_enable(mb, 0);
+ gpu_free_internal(&gpu_mem_ptr);
+
+ vcsm_exit();
+
+ mbox_close(mb);
+ gpu = NULL;
+}
+
+static void gpu_free_internal(GPU_MEM_PTR_T *p) {
+ int mb = gpu->mb;
+ mem_unlock(mb,p->vc_handle);
+ vcsm_unlock_ptr(p->arm);
+ vcsm_free(p->vcsm_handle);
+}
+
+void gpu_free(GPU_MEM_PTR_T *p) {
+ gpu_lock();
+
+ gpu_free_internal(p);
+
+ gpu->open_count--;
+ if (gpu->open_count==0) {
+ printf("Closing GPU\n");
+ gpu_term();
+ gpu = NULL;
+ }
+ gpu_unlock();
+}
+
+unsigned int vpu_get_fn(void) {
+ // Make sure that the gpu is initialized
+ if (gpu==NULL) {
+ printf("Preparing gpu\n");
+ gpu_lock();
+ gpu_unlock();
+ }
+ return gpu->vc + offsetof(struct GPU,vpu_code);
+}
+
+unsigned int vpu_get_constants(void) {
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+ return gpu->vc + offsetof(struct GPU,transMatrix2even);
+}
+
+#ifdef GPUSERVICE
+static void callback(void *cookie)
+{
+ sem_post((sem_t *)cookie);
+}
+#endif
+
+
+static volatile uint32_t post_done = 0;
+static volatile uint32_t post_qed = 0;
+
+static void post_code2_cb(void * v)
+{
+ uint32_t n = (uint32_t)v;
+ if ((int32_t)(n - post_done) > 0) {
+ post_done = n;
+ }
+}
+
+
+// Post a command to the queue
+// Returns an id which we can use to wait for completion
+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+{
+ struct gpu_job_s j[1] = {
+ {
+ .command = EXECUTE_VPU,
+ .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+ .callback.func = post_code2_cb
+ }
+ };
+ uint32_t id;
+
+ j[0].callback.cookie = (void *)(id = ++post_qed);
+
+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+
+ return id;
+}
+
+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ int qpu0_n, const uint32_t * qpu0_mail,
+ int qpu1_n, const uint32_t * qpu1_mail)
+{
+#if 1
+ sem_t sync0;
+ struct gpu_job_s j[4];
+
+ sem_init(&sync0, 0, 0);
+
+ j[0].command = EXECUTE_VPU;
+ j[0].u.v.q[0] = vpu_code;
+ j[0].u.v.q[1] = r0;
+ j[0].u.v.q[2] = r1;
+ j[0].u.v.q[3] = r2;
+ j[0].u.v.q[4] = r3;
+ j[0].u.v.q[5] = r4;
+ j[0].u.v.q[6] = r5;
+ j[0].callback.func = 0;
+ j[0].callback.cookie = NULL;
+
+ j[1].command = EXECUTE_QPU;
+ j[1].u.q.jobs = qpu1_n;
+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+ j[1].u.q.timeout = 5000;
+ j[1].callback.func = 0;
+ j[1].callback.cookie = NULL;
+
+ j[2].command = EXECUTE_QPU;
+ j[2].u.q.jobs = qpu0_n;
+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[2].u.q.noflush = 1;
+ j[2].u.q.timeout = 5000;
+ j[2].callback.func = 0;
+ j[2].callback.cookie = NULL;
+
+ j[3].command = EXECUTE_SYNC;
+ j[3].u.s.mask = 3;
+ j[3].callback.func = callback;
+ j[3].callback.cookie = (void *)&sync0;
+
+ av_assert0(vc_gpuserv_execute_code(4, j) == 0);
+
+ sem_wait(&sync0);
+#else
+
+ sem_t sync0, sync2;
+ struct gpu_job_s j[3];
+
+ sem_init(&sync0, 0, 0);
+ sem_init(&sync2, 0, 0);
+
+ j[0].command = EXECUTE_VPU;
+ j[0].u.v.q[0] = vpu_code;
+ j[0].u.v.q[1] = r0;
+ j[0].u.v.q[2] = r1;
+ j[0].u.v.q[3] = r2;
+ j[0].u.v.q[4] = r3;
+ j[0].u.v.q[5] = r4;
+ j[0].u.v.q[6] = r5;
+ j[0].callback.func = callback;
+ j[0].callback.cookie = (void *)&sync0;
+
+ j[1].command = EXECUTE_QPU;
+ j[1].u.q.jobs = qpu1_n;
+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+ j[1].u.q.timeout = 5000;
+ j[1].callback.func = 0;
+ j[1].callback.cookie = NULL;
+
+ j[2].command = EXECUTE_QPU;
+ j[2].u.q.jobs = qpu0_n;
+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[2].u.q.noflush = 1;
+ j[2].u.q.timeout = 5000;
+ j[2].callback.func = callback;
+ j[2].callback.cookie = (void *)&sync2;
+
+ av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+
+ sem_wait(&sync0);
+ sem_wait(&sync2);
+#endif
+
+ return 0;
+}
+
+
+// Wait for completion of the given command
+void vpu_wait(int id)
+{
+ if (id == 0) {
+#if 0
+ sem_t sync0;
+ struct gpu_job_s j[1] =
+ {
+ {
+ .command = EXECUTE_SYNC,
+ .u.s.mask = 3,
+ .callback.func = callback,
+ .callback.cookie = (void *)&sync0
+ }
+ };
+
+ sem_init(&sync0, 0, 0);
+
+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+
+ sem_wait(&sync0);
+#endif
+ }
+ else {
+ while ((int32_t)(post_done - (uint32_t)id) < 0) {
+ usleep(1000);
+ }
+ }
+}
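+
+// Illustrative sketch of the post/wait pairing above, assuming a VPU entry
+// point obtained from vpu_get_fn(); the register arguments r0..r5 are
+// placeholders for whatever the VPU routine expects.
+//
+//   int id = vpu_post_code2(vpu_get_fn(), r0, r1, r2, r3, r4, r5, NULL);
+//   // ... overlap other CPU-side work here ...
+//   vpu_wait(id);   // returns once post_done has caught up with this id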
+
+
+unsigned int qpu_get_fn(int num) {
+ // Make sure that the gpu is initialized
+ unsigned int *fn;
+ if (gpu==NULL) {
+ printf("Preparing gpu\n");
+ gpu_lock();
+ gpu_unlock();
+ }
+ switch(num) {
+ case QPU_MC_SETUP:
+ fn = mc_setup;
+ break;
+ case QPU_MC_FILTER:
+ fn = mc_filter;
+ break;
+ case QPU_MC_EXIT:
+ fn = mc_exit;
+ break;
+ case QPU_MC_INTERRUPT_EXIT12:
+ fn = mc_interrupt_exit12;
+ break;
+ case QPU_MC_FILTER_B:
+ fn = mc_filter_b;
+ break;
+ //case QPU_MC_FILTER_HONLY:
+ // fn = mc_filter_honly;
+ // break;
+ case QPU_MC_SETUP_UV:
+ fn = mc_setup_uv;
+ break;
+ case QPU_MC_FILTER_UV:
+ fn = mc_filter_uv;
+ break;
+ case QPU_MC_FILTER_UV_B0:
+ fn = mc_filter_uv_b0;
+ break;
+ case QPU_MC_FILTER_UV_B:
+ fn = mc_filter_uv_b;
+ break;
+ case QPU_MC_INTERRUPT_EXIT8:
+ fn = mc_interrupt_exit8;
+ break;
+ case QPU_MC_END:
+ fn = mc_end;
+ break;
+ default:
+ printf("Unknown function\n");
+ exit(-1);
+ }
+ return gpu->vc + 4*(int)(fn-rpi_shader);
+ //return code[num] + gpu->vc;
+}
+
+#if 0
+typedef unsigned int uint32_t;
+
+typedef struct mvs_s {
+ GPU_MEM_PTR_T unif_mvs_ptr;
+ uint32_t *unif_mvs; // Base of memory for motion vector commands
+
+ // _base pointers are to the start of the row
+ uint32_t *mvs_base[8];
+ // these pointers are to the next free space
+ uint32_t *u_mvs[8];
+
+} HEVCContext;
+
+#define RPI_CHROMA_COMMAND_WORDS 12
+
+static void rpi_inter_clear(HEVCContext *s)
+{
+ int i;
+ for(i=0;i<8;i++) {
+ s->u_mvs[i] = s->mvs_base[i];
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 128; // w
+ *s->u_mvs[i]++ = 128; // h
+ *s->u_mvs[i]++ = 128; // stride u
+ *s->u_mvs[i]++ = 128; // stride v
+ s->u_mvs[i] += 3; // Padding words
+ }
+}
+
+static void rpi_execute_inter_qpu(HEVCContext *s)
+{
+ int k;
+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+
+ for(k=0;k<8;k++) {
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V
+ }
+
+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+
+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+ );
+}
+
+void rpi_test_qpu(void)
+{
+ HEVCContext mvs;
+ HEVCContext *s = &mvs;
+ int i;
+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+ uint32_t *p;
+ printf("Allocate memory\n");
+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+
+ // Set up initial locations for uniform streams
+ p = s->unif_mvs;
+ for(i = 0; i < 8; i++) {
+ s->mvs_base[i] = p;
+ p += uv_commands_per_qpu;
+ }
+ // Now run a simple program that should just quit immediately after a single texture fetch
+ rpi_inter_clear(s);
+ for(i=0;i<4;i++) {
+ printf("Launch QPUs\n");
+ rpi_execute_inter_qpu(s);
+ printf("Done\n");
+ }
+ printf("Free memory\n");
+ gpu_free(&s->unif_mvs_ptr);
+ return;
+}
+#endif
+
+#if 0
+
+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24)
+
+static uint8_t av_clip_uint8(int32_t a)
+{
+ if (a&(~255)) return (-a)>>31;
+ else return a;
+}
+
+static int32_t filter8(const uint8_t *data, int pitch)
+{
+ int32_t vsum = 0;
+ int x, y;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += hcoeffs[x]*data[x + y * pitch];
+
+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+ }
+
+ return av_clip_uint8( (vsum + 64) >> 7);
+}
+
+// Note: the regression test changes the coefficients, so it is not thread safe
+//#define REGRESSION
+#ifdef REGRESSION
+#define CMAX 100
+#else
+#define CMAX 2
+#endif
+#define YMAX 16
+
+int rpi_test_shader(void)
+{
+ int i, c;
+
+ uint32_t *unifs;
+
+ uint8_t *in_buffer;
+ uint8_t *out_buffer[2];
+
+ GPU_MEM_PTR_T unifs_ptr;
+ GPU_MEM_PTR_T in_buffer_ptr;
+ GPU_MEM_PTR_T out_buffer_ptr[2];
+
+ // Addresses in GPU memory of filter programs
+ uint32_t mc_setup = 0;
+ uint32_t mc_filter = 0;
+ uint32_t mc_exit = 0;
+
+ int pitch = 0x500;
+
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+
+ printf("This needs to change to reflect new assembler\n");
+ // Use table to compute locations of program start points
+ mc_setup = code[0] + gpu->vc;
+ mc_filter = code[1] + gpu->vc;
+ mc_exit = code[2] + gpu->vc;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+ return -2;
+ }
+ unifs = (uint32_t*)unifs_ptr.arm;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+ return -3;
+ }
+ in_buffer = (uint8_t*)in_buffer_ptr.arm;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+ return -4;
+ }
+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+
+ for (c = 0; c < CMAX; c++) {
+ int xo[] = {rand()&31, rand()&31};
+
+#ifdef REGRESSION
+ for (i = 0; i < 8; i++) {
+ hcoeffs[i] = (int8_t)rand();
+ vcoeffs[i] = (int8_t)rand();
+ if (hcoeffs[i]==-128)
+ hcoeffs[i]++;
+ if (vcoeffs[i]==-128)
+ vcoeffs[i]++;
+ }
+#endif
+
+ for (i = 0; i < 64*23; i++) {
+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+ in_buffer[i] = rand();
+ }
+
+ // Clear output array
+ {
+ int b;
+ for(b=0;b<2;b++) {
+ for(i=0;i<16*16;i++) {
+ out_buffer[b][i] = 3;
+ }
+ }
+ }
+
+ unifs[0] = mc_filter;
+ unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+ unifs[2] = 64; // src pitch
+ unifs[3] = pitch; // dst pitch
+ unifs[4] = 0; // Padding
+ unifs[5] = 0;
+ unifs[6] = 0;
+ unifs[7 ] = mc_filter;
+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[13] = out_buffer_ptr[0].vc;
+ unifs[14] = mc_exit;
+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy
+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[20] = out_buffer_ptr[1].vc;
+
+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+
+ // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+
+ //qpu_run_shader(mc_setup, unifs_ptr.vc);
+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+
+ if (1)
+ {
+ int x, y, b;
+ int bad = 0;
+
+ for (b=0; b<2; ++b)
+ for (y=0; y<YMAX; ++y)
+ for (x=0; x<16; ++x) {
+ int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+
+ if (out_buffer[b][x+y*pitch] != ref) {
+ bad = 1;
+// printf("%d, %d, %d, %d\n", c, b, x, y);
+ }
+#ifndef REGRESSION
+ //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+#endif
+ }
+ if (bad)
+ printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+ else
+ printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+ }
+ //printf("%d\n", simpenrose_get_qpu_tick_count());
+ }
+
+ gpu_free(&out_buffer_ptr[0]);
+ gpu_free(&out_buffer_ptr[1]);
+ gpu_free(&in_buffer_ptr);
+ gpu_free(&unifs_ptr);
+
+ return 0;
+}
+
+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+{
+ int x,y;
+ for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+ }
+ }
+}
+
+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+{
+ uint32_t *unifs;
+
+ GPU_MEM_PTR_T unifs_ptr;
+ //uint8_t *out_buffer;
+ //GPU_MEM_PTR_T out_buffer_ptr;
+
+ // Addresses in GPU memory of filter programs
+ uint32_t mc_setup = 0;
+ uint32_t mc_filter = 0;
+ uint32_t mc_exit = 0;
+ //int x,y;
+
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+
+ // Use table to compute locations of program start points
+ mc_setup = code[0] + gpu->vc;
+ mc_filter = code[1] + gpu->vc;
+ mc_exit = code[2] + gpu->vc;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+ return;
+ }
+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+ //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+
+ /*for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ out_buffer[x+y*dst_pitch] = 7;
+ }
+ }*/
+
+ unifs = (uint32_t*)unifs_ptr.arm;
+
+ unifs[0] = mc_filter;
+ unifs[1] = (int)in_buffer_vc;
+ unifs[2] = src_pitch; // src pitch
+ unifs[3] = dst_pitch; // dst pitch
+ unifs[4] = 0; // Padding
+ unifs[5] = 0;
+ unifs[6] = 0;
+ unifs[7 ] = mc_exit;
+ unifs[8 ] = (int)in_buffer_vc;
+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[13] = (int)dst_vc;
+ //unifs[13] = (int)out_buffer_ptr.vc;
+
+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+
+ qpu_run_shader(mc_setup, unifs_ptr.vc);
+
+ /*for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+ }
+ }*/
+
+ gpu_free(&unifs_ptr);
+ //gpu_free(&out_buffer_ptr);
+}
+
+
+
+#endif
+
+#endif // RPI
diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
new file mode 100644
index 0000000..c6cdb2b
--- /dev/null
+++ b/libavcodec/rpi_qpu.h
@@ -0,0 +1,176 @@
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+// *** N.B. The fallback path has rotted and crashes if this is left unset (as it stood before this set of changes)
+#define RPI_FAST_CACHEFLUSH
+
+#define RPI_ONE_BUF 1
+
+typedef struct gpu_mem_ptr_s {
+ unsigned char *arm; // Pointer to memory mapped on ARM side
+ int vc_handle; // Videocore handle of relocatable memory
+ int vcsm_handle; // Handle for use by VCSM
+ int vc; // Address for use in GPU code
+ int numbytes; // Size of memory block
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T *p);
+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+
+#include "libavutil/frame.h"
+#if !RPI_ONE_BUF
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+ return p->vc;
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+}
+
+#else
+
+static inline int gpu_is_buf1(const AVFrame * const frame)
+{
+ return frame->buf[1] == NULL;
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+{
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+{
+ return av_buffer_pool_opaque(frame->buf[n]);
+}
+
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ?
+ gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+ gpu_buf3_gmem(frame, 1)->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ?
+ gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+ gpu_buf3_gmem(frame, 2)->vc;
+}
+
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.numbytes = frame->data[1] - frame->data[0];
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 0);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[1] - frame->data[0];
+ g.vc += frame->data[1] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 1);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[2] - frame->data[0];
+ g.vc += frame->data[2] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
+
+#endif
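+
+// Illustrative sketch (not part of this header's API surface): how the
+// accessors above are expected to be used to hand an AVFrame's planes to
+// GPU code. The frame pointer is hypothetical and assumed to come from a
+// GPU-memory frame pool.
+//
+//   uint32_t y_vc = get_vc_address_y(frame);
+//   uint32_t u_vc = get_vc_address_u(frame);
+//   uint32_t v_vc = get_vc_address_v(frame);
+//   // these bus addresses can then be written into QPU uniform/command streams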
+
+
+// QPU specific functions
+extern void rpi_test_qpu(void);
+
+enum {
+ QPU_MC_SETUP,
+ QPU_MC_FILTER,
+ QPU_MC_EXIT,
+ QPU_MC_INTERRUPT_EXIT12,
+ QPU_MC_FILTER_B,
+ QPU_MC_FILTER_HONLY,
+ QPU_MC_SETUP_UV,
+ QPU_MC_FILTER_UV,
+ QPU_MC_FILTER_UV_B0,
+ QPU_MC_FILTER_UV_B,
+ QPU_MC_INTERRUPT_EXIT8,
+ QPU_MC_END
+ };
+extern unsigned int qpu_get_fn(int num);
+
+#define QPU_N_UV 8
+#define QPU_N_Y 12
+#define QPU_N_MAX 16
+
+#define QPU_MAIL_EL_VALS 2
+#define QPU_MAIL_EL_SIZE (QPU_MAIL_EL_VALS * sizeof(uint32_t))
+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
+
+// VPU specific functions
+extern unsigned int vpu_get_fn(void);
+extern unsigned int vpu_get_constants(void);
+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ int qpu0_n, const uint32_t * qpu0_mail,
+ int qpu1_n, const uint32_t * qpu1_mail);
+
+extern void vpu_wait( int id);
+
+// Simple test of shader code
+extern int rpi_test_shader(void);
+
+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+
+extern int gpu_get_mailbox(void);
+
+#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
index 0000000..06fb166
--- /dev/null
+++ b/libavcodec/rpi_shader.c
@@ -0,0 +1,629 @@
+#include "rpi_shader.h"
+
+#ifdef _MSC_VER
+ #include <stdint.h>
+ /* cast through uintptr_t to avoid warnings */
+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+#else
+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
+#endif
+
+#ifdef __cplusplus
+extern "C" { /* the types are probably wrong... */
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+__declspec(align(8))
+#elif defined(__GNUC__)
+__attribute__((aligned(8)))
+#endif
+unsigned int rpi_shader[] = {
+// ::mc_setup_uv
+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+// ::mc_filter_uv
+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28
+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif
+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+// :uvloop
+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_filter_uv_b0
+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21
+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0, r0, i_shift16 ; mov ra3, unif
+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif
+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0
+// :uvloop_b0
+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_filter_uv_b
+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif
+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8
+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif
+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+// :uvloop_b
+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4
+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14
+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_exit
+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop
+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_interrupt_exit8
+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_setup
+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+// :per_block_setup
+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num
+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b
+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b
+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif
+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif
+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13
+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a
+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b
+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c
+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d
+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+// ::mc_filter
+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+// :yloop
+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_filter_b
+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+// :yloopb
+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12
+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14
+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_interrupt_exit12
+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_exit1
+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_end
+};
+#ifdef __HIGHC__
+#pragma Align_to(8, rpi_shader)
+#endif
diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
new file mode 100644
index 0000000..9772796
--- /dev/null
+++ b/libavcodec/rpi_shader.h
@@ -0,0 +1,19 @@
+#ifndef rpi_shader_H
+#define rpi_shader_H
+
+extern unsigned int rpi_shader[];
+
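+/* Note: each entry below is a word offset into the 32-bit rpi_shader[] code
+ * array.  A QPU instruction is two 32-bit words, so a kernel's byte offset in
+ * the assembled dump is 4 * the value here - e.g. mc_filter at word 872
+ * corresponds to byte address 0x00000da0 in rpi_shader.c.
+ */
+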
+#define mc_setup_uv (rpi_shader + 0)
+#define mc_filter_uv (rpi_shader + 132)
+#define mc_filter_uv_b0 (rpi_shader + 274)
+#define mc_filter_uv_b (rpi_shader + 392)
+#define mc_exit (rpi_shader + 540)
+#define mc_interrupt_exit8 (rpi_shader + 558)
+#define mc_setup (rpi_shader + 588)
+#define mc_filter (rpi_shader + 872)
+#define mc_filter_b (rpi_shader + 992)
+#define mc_interrupt_exit12 (rpi_shader + 1114)
+#define mc_exit1 (rpi_shader + 1152)
+#define mc_end (rpi_shader + 1168)
+
+#endif
diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
index 0000000..aa9e1e7
--- /dev/null
+++ b/libavcodec/rpi_shader.qasm
@@ -0,0 +1,1098 @@
+# register allocation
+#
+# ra0...ra7 eight horizontal filter coefficients
+#
+# rb0 rx_shift2
+# rb1 rb_y2_next
+#
+# rb4...rb7
+#
+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent)
+#
+# (ra15 isn't clamped to zero - this happens during the
+# copy to ra14, and during its use in the vertical filter)
+#
+# rb8...rb11 eight vertical filter coefficients
+
+# ra4 Y: Filter, UV: 0x10000
+
+# rb12 offset to add before shift (round + weighting offsets)
+# rb13 shift: denom + 6 + 9
+# rb14 L0 weight (U on left, V on right)
+# rb15 -- free --
+#
+# ra16 clipped(row start address+elem_num)&~3
+# ra17 per-channel shifts
+# ra18 L1 weight (Y)
+# ra19 next ra17
+#
+# rb16 pitch
+# rb17 height + 1
+# rb18 height + 3
+# rb19 next ra16
+#
+# ra20 1
+# ra21 ra_21
+# ra22 ra_k256 256
+# ra23 ra_y2_next ra_y2_next
+#
+# rb20 0xffffff00
+# rb21 vpm_setup for reading/writing 16bit results into VPM
+# rb22 rb_k255 255
+# rb23 24
+#
+# rb24 vdw_setup_1(dst_pitch)
+# rb25 frame width-1
+# rb26 height<<23 + width<<16 + vdw_setup_0
+# rb27 vdw_setup_0 (depends on QPU number)
+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM
+# rb29 vdw_setup_1(dst_pitch-width)
+# rb30 frame height-1
+# rb31 used as temp to count loop iterations
+#
+# ra24 clipped(row start address+8+elem_num)&~3
+# ra25 per-channel shifts 2
+# ra26 next ra24
+# ra27 next ra25
+# ra28 next y
+# ra29 y for next texture access
+# ra30 64
+#
+# ra31 next kernel address
+
+.set rb_frame_width_minus_1, rb25
+.set rb_frame_height_minus_1, rb30
+.set rb_pitch, rb16
+.set ra_x, ra16
+.set ra_y2, ra21.16a
+.set ra_y2_next, ra21.16b
+
+.set rb_x_next, rb19
+.set rx_frame_base2_next, rb19
+
+.set ra_frame_base, ra24
+.set ra_frame_base_next, ra26
+.set ra_xshift, ra17
+
+.set ra_u2v_ref_offset, ra25
+.set ra_frame_base2, ra25
+
+.set ra_xshift_next, ra19
+.set rx_xshift2, rb0
+.set rx_xshift2_next, rb1
+
+.set ra_u2v_dst_offset, ra27
+
+.set ra_y_next, ra28
+.set ra_y, ra29
+
+.set ra_k1, ra20
+.set rb_k255, rb22
+.set ra_k256, ra22
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
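+# e.g. -16 & 31 == 16 and -11 & 31 == 21, giving left shifts of 16 and 21
+# (which presumably would not fit in the small-immediate range directly)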
+
+################################################################################
+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+::mc_setup_uv
+
+# Read starting kernel
+mov ra31, unif
+
+# Load first request location
+add ra_x, unif, elem_num # Store x
+mov ra_y, unif # Store y
+mov ra_frame_base, unif # Store frame u base
+nop
+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+
+# Read image dimensions
+sub rb25,unif,1
+sub rb30,unif,1
+
+# get source pitch
+mov rb16, unif
+
+# get destination pitch
+mov r0, unif
+mov r1, vdw_setup_1(0)
+add rb24, r1, r0
+
+# load constants
+
+mov ra4, 0x10000
+mov ra_k1, 1
+mov ra_k256, 256
+mov ra30, 64
+
+mov rb20, 0xffffff00
+mov rb_k255, 255
+mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
+mov ra8, 0
+mov ra9, 0
+mov ra10, 0
+mov ra11, 0
+mov ra12, 0
+mov ra13, 0
+mov ra14, 0
+mov ra15, 0
+
+# Compute base address for first and second access
+mov r0, ra_x # Load x
+max r0, r0, 0; mov r1, ra_y # Load y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base
+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+add ra_y, r1, 1
+add r0, r0, r3
+and r0, r0, ~3
+max r1, r1, 0 ; mov ra_x, r0 # y
+min r1, r1, rb_frame_height_minus_1
+# submit texture requests for first line
+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+add t0s, r0, r1 ; mov ra_frame_base, r2
+add t1s, r2, r1
+
+mov r2, 9
+add rb13, r2, unif # denominator
+mov -, unif # Unused
+
+# Compute part of VPM to use for DMA output
+mov r2, unif
+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+and r2, r2, 15
+mov r1, r2
+asr r1, r1, 2
+shl r1, r1, 6
+mov r0, r2
+and r0, r0, 3
+add r0, r0, r1
+
+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+add rb28, r0, r1 # VPM 8bit storage
+asr r2, r0, 1 # r0 = bc0000d
+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+add rb21, r2, r1 # VPM for 16bit intermediates
+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+shl r0, r0, 5
+add rb27, r0, r1 # DMA out
+
+# submit texture requests for second line
+max r1, ra_y, 0
+min r1, r1, rb_frame_height_minus_1
+add ra_y, ra_y, 1
+bra -, ra31
+nop ; mul24 r1, r1, rb_pitch
+add t0s, r1, ra_x
+add t1s, r1, ra_frame_base
+
+
+
+################################################################################
+
+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov r1, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+mov ra_y_next, r1 ; mov vw_setup, rb28
+add ra_frame_base_next, rb_x_next, r2
+
+# set up VPM write
+# get width,height of block
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# unpack filter coefficients
+
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight
+nop ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop count
+
+shl r1, ra1.16b, rb13
+asr rb12, r1, 1
+shl rb14, ra1.16a, 1 # b14 = weight*2
+
+# rb14 - weight L0 * 2
+# rb13 = weight denom + 6 + 9
+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+
+# r2 is elem_num
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop
+mov ra13, ra14 ; mul24 r1, ra14, rb9
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+sub r1, r1, r0 ; mov -, vw_wait
+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+asr r1, r1, 14
+nop ; mul24 r1, r1, rb14
+shl r1, r1, 8
+
+add r1, r1, rb12
+brr.anyn -, r:uvloop
+asr r1, r1, rb13
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
+
+# DMA out for U
+
+mov vw_setup, rb26 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+# DMA out for V
+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+# Could potentially push this write into the start of the next pipeline stage.
+mov r0, 16
+mov -, vw_wait
+
+bra -, ra31
+add vw_setup, rb26, r0 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+
+################################################################################
+
+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv_b0
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov r1, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs
+mov ra_y_next, r1 ; mov vw_setup, rb21
+
+add ra_frame_base_next, rb_x_next, r2
+
+# Need to have unsigned coeffs so we can just unpack in the filter
+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+# filter code. Unpack into b regs for V
+
+# set up VPM write, we need to save 16bit precision
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs
+add rb26, r0, rb27
+
+mov rb8, ra3.8a
+mov rb9, ra3.8b
+mov rb10, ra3.8c
+mov rb11, ra3.8d
+
+# r2 is elem_num
+# r3 is loop counter
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+mov rb14, unif # U weight L0
+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter
+# rb14 unused in b0 but will hang around till the second pass
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop_b0
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b0
+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b0
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+sub.setf -, r3, rb18
+brr.anyn -, r:uvloop_b0
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+sub r1, r1, r0 ; mov -, vw_wait
+asr vpm, r1, 6
+# >>> .anyn uvloop_b0
+
+# in pass0 we don't really need to save any results, but need to discard the uniforms
+# DMA out for U
+
+bra -, ra31
+mov -, unif # Delay 1
+mov -, unif # Delay 2
+nop # Delay 3
+
+
+################################################################################
+
+::mc_filter_uv_b
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# set up VPM write
+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov ra_y_next, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base
+add r0, r0, r3 ; mov ra1, unif # width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+
+add ra_frame_base_next, rb_x_next, r2
+
+# r0 is currently height<<7
+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+shr r3, r3, 8
+add vr_setup, r3, rb21
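+# (the shl by 21 followed by the logical shr by 8 is a net shl by 13 that also
+# clears the top 8 bits, turning height<<7 into height<<20 for vr_setup)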
+
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27
+
+# get filter coefficients
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# Get offset & weight stuff
+
+# The unif read occurs unconditionally, only the write is conditional
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ;
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ;
+nop ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ;
+
+shl r1, ra1.16b, rb13
+asr rb12, r1, 1
+
+# ra1.16a used directly in the loop
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop_b
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b
+mov ra13, ra14 ; mul24 r1, ra14, rb9
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000
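+# (the multiply by 0x10000 moves the unsigned 16-bit VPM value into the top
+# half-word; the asr by 16 a few instructions below recovers it sign extended)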
+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+asr r1, r1, 14 # shift2=6
+
+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
+nop ; mul24 r0, r0, rb14
+
+add r1, r1, r0 ; mov -, vw_wait
+shl r1, r1, 8 # Lose bad top 8 bits & sign extend
+
+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+
+brr.anyn -, r:uvloop_b
+asr r1, r1, rb13 # Delay 1
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
+
+
+# DMA out for U
+
+mov vw_setup, rb26 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+# DMA out for V
+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+# Could potentially push this write into the start of the next pipeline stage.
+mov r0, 16
+mov -, vw_wait
+
+bra -, ra31
+add vw_setup, rb26, r0 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+################################################################################
+
+# mc_exit()
+
+::mc_exit
+mov -, vw_wait # wait on the VDW
+
+mov -,srel(0)
+
+ldtmu0
+ldtmu1
+ldtmu0
+ldtmu1
+
+nop ; nop ; thrend
+nop ; nop # delay slot 1
+nop ; nop # delay slot 2
+
+# mc_interrupt_exit8()
+::mc_interrupt_exit8
+mov -, vw_wait # wait on the VDW
+
+ldtmu0
+ldtmu1
+ldtmu0
+ldtmu1
+
+mov -,sacq(0) # 1
+mov -,sacq(0) # 2
+mov -,sacq(0) # 3
+mov -,sacq(0) # 4
+mov -,sacq(0) # 5
+mov -,sacq(0) # 6
+mov -,sacq(0) # 7
+
+nop ; nop ; thrend
+mov interrupt, 1; nop # delay slot 1
+nop ; nop # delay slot 2
+
+
+
+
+
+# LUMA CODE
+
+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8
+
+################################################################################
+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+::mc_setup
+ mov r3, 16
+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
+ mov ra8, unif # y_x
+ mov ra9, unif # ref_y_base
+ mov ra10, unif # y2_x2
+ mov ra11, unif # ref_y2_base
+
+# Read image dimensions
+ mov r1, unif # width_height
+ shl r0,r1,r3
+ asr r1,r1,r3 # width
+ asr r0,r0,r3 # height
+ sub rb_frame_width_minus_1,r1,1
+ sub rb_frame_height_minus_1,r0,1
+
+# get source pitch
+ mov rb_pitch, unif # src_pitch
+
+# get destination pitch
+ mov r0, unif # dst_pitch
+ mov r1, vdw_setup_1(0)
+ add rb24, r1, r0
+
+# Compute base address for first and second access
+ mov r1, ra8 # y_x
+ shl r0,r1,r3 # r0 is x<<16
+ asr r1,r1,r3 # r1 is y
+ asr r0,r0,r3 # r0 is x
+ add r0, r0, elem_num # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ add ra_y, r1, 1
+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
+ add r2, r2, r0 # r2 is address for frame0 (not including y offset)
+ max r1, r1, 0
+ min r1, r1, rb_frame_height_minus_1
+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0
+ add t0s, r2, r1 ; mov ra_frame_base, r2
+
+ mov r1, ra10 # y2_x2
+ shl r0,r1,r3 # r0 is x<<16
+ asr r1,r1,r3 # r1 is y
+ asr r0,r0,r3 # r0 is x
+ add r0, r0, elem_num # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
+ add ra_y2, r1, 1
+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
+ add r2, r2, r0 # r2 is address for frame1 (not including y offset)
+ max r1, r1, 0
+ min r1, r1, rb_frame_height_minus_1
+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame1
+ add t1s, r2, r1 ; mov ra_frame_base2, r2
+
+
+# load constants
+
+ mov ra_k1, 1
+ mov ra_k256, 256
+ mov ra30, 64
+
+ mov rb20, 0xffffff00
+ mov rb_k255, 255
+ mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
+ mov ra8, 0
+ mov ra9, 0
+ mov ra10, 0
+ mov ra11, 0
+ mov ra12, 0
+ mov ra13, 0
+ mov ra14, 0
+ mov ra15, 0
+
+# Compute part of VPM to use
+ mov r2, qpu_num
+ mov r1, r2
+ asr r1, r1, 2
+ shl r1, r1, 6
+ mov r0, r2
+ and r0, r0, 3
+ add r0, r0, r1
+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+ add rb28, r0, r1 # VPM for saving data
+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+ shl r0, r0, 5
+ add rb27, r0, r1 # Command for dma output
+
+# Weighted prediction denom
+ add rb13, unif, 9 # unif = weight denom + 6
+
+ mov -, unif # Unused
+
+# submit texture requests for second line
+ max r1, ra_y, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y, ra_y, 1
+ nop ; mul24 r1, r1, rb_pitch
+ add t0s, r1, ra_frame_base
+
+ max r1, ra_y2, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1
+ nop ; mul24 r1, r1, rb_pitch
+ add t1s, r1, ra_frame_base2
+
+# FALL THROUGH TO PER-BLOCK SETUP
+
+# Start of per-block setup code
+# P and B blocks share the same setup code to save on Icache space
+:per_block_setup
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ mov ra31, unif
+
+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack??
+
+# per-channel shifts were calculated on the *previous* invocation
+ mov ra_xshift, ra_xshift_next
+ mov rx_xshift2, rx_xshift2_next
+
+# get base addresses and per-channel shifts for *next* invocation
+
+ add r0, ra1.16a, r1 # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ mov r3, 8 ; mov ra_y_next, ra1.16b
+ and r0, r0, ~3 ; mov ra1, unif # y2_x2
+ add ra_frame_base_next, r2, r0
+
+ add r0, ra1.16a, r1 # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ;
+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate
+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset)
+
+# set up VPM write
+ mov vw_setup, rb28
+
+# get width,height of block (unif load above)
+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+ add rb17, ra1.16a, 5
+ add rb18, ra1.16a, 7
+ shl r0, ra1.16a, 7
+ add r0, r0, ra1.16b # Combine width and height of destination area
+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
+ shl.ifz r0, r0, i_shift16 # Pick half to use
+ shl ra8, r0, 3
+
+# Pack the 1st 4 filter coefs for H & V tightly
+
+ mov r1,0x00010100 # -ve
+ ror ra2.8a, r1, ra8.8d
+ ror ra0.8a, r1, ra8.8c
+
+ mov r1,0x01040400
+ ror ra2.8b, r1, ra8.8d
+ ror ra0.8b, r1, ra8.8c
+
+ mov r1,0x050b0a00 # -ve
+ ror ra2.8c, r1, ra8.8d
+ ror ra0.8c, r1, ra8.8c
+
+ mov r1,0x11283a40
+ ror ra2.8d, r1, ra8.8d
+ ror ra0.8d, r1, ra8.8c
+
+# In the 2nd vertical half we use b registers due to
+# using a-side fifo regs. The easiest way to achieve this to pack it
+# and then unpack!
+
+ mov r1,0x3a281100
+ ror ra3.8a, r1, ra8.8d
+ ror ra1.8a, r1, ra8.8c
+
+ mov r1,0x0a0b0500 # -ve
+ ror ra3.8b, r1, ra8.8d
+ ror ra1.8b, r1, ra8.8c
+
+ mov r1,0x04040100
+ ror ra3.8c, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c
+
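+# (Note: each 32-bit constant above appears to hold one tap's magnitude for
+# the four fractional positions, one per byte - e.g. 0x11283a40 is 17, 40,
+# 58, 64.  ra8 contains the fractional offsets multiplied by 8, so the ror
+# by ra8.8c/ra8.8d rotates the byte for the wanted fraction into the lane
+# being written; signs are applied by the add/sub pattern in the filter
+# loops, hence the "-ve" markers.)
+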
+# Extract weighted prediction information in parallel
+
+ mov r1,0x01010000 # -ve
+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 (hi16) / weight L0 (lo16)
+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9
+
+# r3 = 16 from (long way) above
+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offset = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+ asr ra18, r0, r3 ; mov rb5, ra3.8b
+ bra -, ra31
+ shl r0, r0, r3 ; mov rb6, ra3.8c
+ mov r3, 0 ; mov rb7, ra3.8d # loop count ;
+ asr rb12, r1, 9
+
+# >>> branch ra31
+#
+# r3 = 0
+# ra18 = weight L1
+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble)
+# rb13 = weight denom + 6 + 9
+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+
+
+################################################################################
+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, y2_x2 should be y_x+8
+# At this point we have already issued two pairs of texture requests for the current block
+
+::mc_filter
+# r0 = weight << 16; We want weight * 2 in rb14
+ asr rb14, r0, 15
+
+# r3 = 0
+
+:yloop
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values?
+
+# N.B. Whilst y == y2 as far as this loop is concerned, we will start
+# the grab for the next block before we finish with this block, and that
+# might be a B block where y != y2, so we must do full processing on both y and y2
+
+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloop
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+ # >>> .anyn yloop
+
+ # apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov -, vw_wait
+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+# The top 8 bits have rubbish in them as mul24 is unsigned
+# The low 6 bits need discard before weighting
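+# (mul24 only uses the low 24 bits of r1, so the x256 below is a <<8 of the
+# valid bits, placing their sign bit at bit 31; the asr by 14 then yields a
+# sign-correct overall >>6)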
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, rb14
+ add r1, r1, rb12
+
+ shl r1, r1, 8
+ brr.anyn -, r:yloop
+ asr r1, r1, rb13
+# We have a saturating pack unit - I can't help feeling it should be useful here
+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255
+ max vpm, r1, 0 # Delay 3
+# >>> branch.anyn yloop
+
+# DMA out
+
+ brr -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
+ mov vw_addr, unif # start the VDW Delay 3
+
+
+
+################################################################################
+
+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+# Can fill in the coefficients so only
+# Can also assume default weighted prediction for B frames.
+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+# Or possibly by taking advantage of symmetry?
+# From 19->7 32bits per command.
+
+::mc_filter_b
+ # r0 = weightL0 << 16, we want it in rb14
+ asr rb14, r0, i_shift16
+
+:yloopb
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values?
+
+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloopb
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+ # >>> .anyn yloopb
+
+ # apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb12
+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+# Top 8 bits are bad - low 6 bits should be discarded
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+
+ asr r1, r1, 14
+ nop ; mul24 r0, r1, rb14
+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
+
+ add r1, r1, r0 ; mov -, vw_wait
+ shl r1, r1, 8
+
+ brr.anyn -, r:yloopb
+ asr r1, r1, rb13 # Delay 1
+ min r1, r1, rb_k255 # Delay 2
+ max vpm, r1, 0 # Delay 3
+
+# DMA out
+ brr -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
+ mov vw_addr, unif # start the VDW Delay 3
+
+################################################################################
+
+# mc_interrupt_exit12()
+::mc_interrupt_exit12
+ mov -, vw_wait # wait on the VDW
+
+ # Dummy wait to test instructions
+# mov r3,1000000
+#:dummy_loop
+# sub.setf r3, r3, 1
+# nop
+# nop
+# brr.anynn -, r:dummy_loop
+# nop
+# nop
+# nop
+
+ ldtmu0
+ ldtmu0
+ ldtmu1
+ ldtmu1
+
+ mov -,sacq(0) # 1
+ mov -,sacq(0) # 2
+ mov -,sacq(0) # 3
+ mov -,sacq(0) # 4
+ mov -,sacq(0) # 5
+ mov -,sacq(0) # 6
+ mov -,sacq(0) # 7
+ mov -,sacq(0) # 8
+ mov -,sacq(0) # 9
+ mov -,sacq(0) # 10
+ mov -,sacq(0) # 11
+
+ nop ; nop ; thrend
+ mov interrupt, 1; nop # delay slot 1
+ nop ; nop # delay slot 2
+
+
+::mc_exit1
+ mov -, vw_wait # wait on the VDW
+
+ ldtmu0
+ ldtmu1
+ ldtmu0
+ ldtmu1
+ nop ; nop ; thrend
+ mov interrupt, 1; nop # delay slot 1
+ nop ; nop # delay slot 2
+
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
new file mode 100644
index 0000000..db41a4d
--- /dev/null
+++ b/libavcodec/rpi_user_vcsm.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved.
+*
+* This program is the proprietary software of Broadcom Corporation and/or
+* its licensors, and may only be used, duplicated, modified or distributed
+* pursuant to the terms and conditions of a separate, written license
+* agreement executed between you and Broadcom (an "Authorized License").
+* Except as set forth in an Authorized License, Broadcom grants no license
+* (express or implied), right to use, or waiver of any kind with respect to
+* the Software, and Broadcom expressly reserves all rights in and to the
+* Software and all intellectual property rights therein. IF YOU HAVE NO
+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+* THE SOFTWARE.
+*
+* Except as expressly set forth in the Authorized License,
+* 1. This program, including its structure, sequence and organization,
+* constitutes the valuable trade secrets of Broadcom, and you shall use
+* all reasonable efforts to protect the confidentiality thereof, and to
+* use this information only in connection with your use of Broadcom
+* integrated circuit products.
+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+* RESPECT TO THE SOFTWARE. BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+*****************************************************************************/
+
+#ifndef __USER_VCSM__H__INCLUDED__
+#define __USER_VCSM__H__INCLUDED__
+
+/* VideoCore Shared Memory - user interface library.
+**
+** This library provides all the necessary abstraction for any application to
+** make use of the shared memory service which is distributed across a kernel
+** driver and a videocore service.
+**
+** It is an application design decision to choose or not to use this service.
+**
+** The logical flow of operations that a user application needs to follow when
+** using this service is:
+**
+** 1) Initialize the service.
+** 2) Allocate shared memory blocks.
+** 3) Start using the allocated blocks.
+** - In order to gain ownership on a block, lock the allocated block,
+** locking a block returns a valid address that the user application
+** can access.
+** - When finished with using the block for the current execution cycle
+** or function, and so when giving up the ownership, unlock the block.
+** 4) A block can be locked/unlocked as many times as required - within or outside
+** of - a specific execution context.
+** 5) To completely release an allocated block, free it.
+** 6) If the service is no longer required, terminate it.
+**
+**
+** Some generic considerations:
+
+** Allocating memory blocks.
+**
+** Memory blocks can be allocated in different manners depending on the cache
+** behavior desired. A given block can either be:
+
+** - Allocated in a non cached fashion all the way through host and videocore.
+** - Allocated in a cached fashion on host OR videocore.
+** - Allocated in a cached fashion on host AND videocore.
+**
+** It is an application decision to determine how to allocate a block. Evidently
+** if the application will be doing substantial read/write accesses to a given block,
+** it is recommended to allocate the block at least in a 'host cached' fashion for
+** better results.
+**
+**
+** Locking memory blocks.
+**
+** When the memory block has been allocated in a host cached fashion, locking the
+** memory block (and so taking ownership of it) will trigger a cache invalidation.
+**
+** For the above reason and when using host cached allocation, it is important that
+** an application properly implements the lock/unlock mechanism to ensure the cache
+** stays coherent; otherwise there is no guarantee that it will.
+**
+** It is possible to dynamically change the host cache behavior (ie cached or non
+** cached) of a given allocation without needing to free and re-allocate the block.
+** This feature can be useful for an application which requires access to the block
+** only at certain times and not otherwise. By changing the cache behavior dynamically,
+** the application can optimize performance for a given duration of use.
+** Such dynamic cache behavior remapping only applies to host cache and not videocore
+** cache. If one requires to change the videocore cache behavior, then a new block
+** must be created to replace the old one.
+**
+** On successful locking, a valid pointer is returned that the application can use
+** to access to data inside the block. There is no guarantee that the pointer will
+** stay valid following the unlock action corresponding to this lock.
+**
+**
+** Unlocking memory blocks.
+**
+** When the memory block has been allocated in a host cached fashion, unlocking the
+** memory block (and so giving up its ownership) will trigger a cache flush unless
+** explicitly asked not to flush the cache for performance reasons.
+**
+** For the above reason and when using host cached allocation, it is important that
+** an application properly implements the lock/unlock mechanism to ensure the cache
+** stays coherent; otherwise there is no guarantee that it will.
+**
+**
+** A complete API is defined below.
+*/
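+
+/* Illustrative usage sketch (not part of the original interface comment),
+** following the flow described above: initialise, allocate, lock to obtain a
+** CPU pointer, use the memory, unlock, free, then terminate.  vcsm_unlock_ptr
+** is assumed here as the vcsm_unlock_xx variant (declared further down in
+** this header); memset needs <string.h>.
+**
+**   if (vcsm_init() == 0) {
+**       unsigned int hdl = vcsm_malloc_cache(64 * 1024, VCSM_CACHE_TYPE_HOST, "example");
+**       if (hdl) {
+**           void *p = vcsm_lock(hdl);       // take ownership, get a usable pointer
+**           if (p) {
+**               memset(p, 0, 64 * 1024);    // use the memory
+**               vcsm_unlock_ptr(p);         // give up ownership (flushes host cache)
+**           }
+**           vcsm_free(hdl);                 // release the block
+**       }
+**       vcsm_exit();
+**   }
+*/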
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Different status that can be dumped.
+*/
+typedef enum
+{
+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore.
+ // Result of the walk is seen in the videocore
+ // log.
+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host
+ // driver (ie for all processes). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host
+ // driver (for current process). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+ // driver (for current process). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+ // VCSM_STATUS_HOST_WALK_MAP.
+ //
+ VCSM_STATUS_NONE, // Must be last - invalid.
+
+} VCSM_STATUS_T;
+
+/* Different kind of cache behavior.
+*/
+typedef enum
+{
+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies.
+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space).
+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore.
+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore.
+
+} VCSM_CACHE_TYPE_T;
+
+/* Initialize the vcsm processing.
+**
+** Must be called once before attempting to do anything else.
+**
+** Returns 0 on success, -1 on error.
+*/
+int vcsm_init( void );
+
+
+/* Terminates the vcsm processing.
+**
+** Must be called when vcsm services are no longer needed; it will
+** take care of removing any allocation under the current process
+** control if deemed necessary.
+*/
+void vcsm_exit( void );
+
+
+/* Queries the status of the vcsm.
+**
+** Triggers dump of various kind of information, see the
+** different variants specified in VCSM_STATUS_T.
+**
+** Pid is optional.
+*/
+void vcsm_status( VCSM_STATUS_T status, int pid );
+
+
+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+** allocator.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on which one to use).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc( unsigned int size, char *name );
+
+
+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+** allocator; the type of caching requested is passed as an argument to the
+** function call.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on which one to use).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+
+
+/* Shares an allocated block of memory via the vcsm memory allocator.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on which one to use).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc_share( unsigned int handle );
+
+
+/* Resizes a block of memory allocated previously by vcsm_malloc.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** The handle must be unlocked by user prior to attempting any
+** resize action.
+**
+** On error, the original size allocated against the handle
+** remains available the same way it would be following a
+** successful vcsm_malloc.
+*/
+int vcsm_resize( unsigned int handle, unsigned int new_size );
+
+
+/* Frees a block of memory that was successfully allocated by
+** a prior call to vcsm_malloc.
+**
+** The handle should be considered invalid upon return from this
+** call.
+**
+** Whether any memory is actually freed up or not as the result of
+** this call will depend on many factors; if all goes well it will
+** be freed. If something goes wrong, the memory will likely end up
+** being freed up as part of the vcsm_exit process. In the end the
+** memory is guaranteed to be freed one way or another.
+*/
+void vcsm_free( unsigned int handle );
+
+
+/* Retrieves a videocore opaque handle from a mapped user address
+** pointer. The videocore handle will correspond to the actual
+** memory mapped in videocore.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** Note: the videocore opaque handle is distinct from the user
+** opaque handle (allocated via vcsm_malloc) and is only
+** significant to applications which know what to do with it;
+** for others it is just a number of little use, since nothing
+** can be done with it (in particular, for safety reasons it
+** cannot be used to map anything).
+*/
+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+
+
+/* Retrieves a videocore opaque handle from a user opaque
+** handle. The videocore handle will correspond to the actual
+** memory mapped in videocore.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** Note: the videocore opaque handle is distinct from the user
+** opaque handle (allocated via vcsm_malloc) and is only
+** significant to applications which know what to do with it;
+** for others it is just a number of little use, since nothing
+** can be done with it (in particular, for safety reasons it
+** cannot be used to map anything).
+*/
+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+
+
+/* Retrieves a user opaque handle from a mapped user address
+** pointer.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+*/
+unsigned int vcsm_usr_handle( void *usr_ptr );
+
+
+/* Retrieves a mapped user address from an opaque user
+** handle.
+**
+** Returns: 0 on error
+** a non-zero address on success.
+**
+** On success, the address corresponds to the pointer
+** which can access the data allocated via the vcsm_malloc
+** call.
+*/
+void *vcsm_usr_address( unsigned int handle );
+
+
+/* Locks the memory associated with this opaque handle.
+**
+** Returns: NULL on error
+** a valid pointer on success.
+**
+** A user MUST lock the handle received from vcsm_malloc
+** in order to be able to use the memory associated with it.
+**
+** On success, the pointer returned is only valid within
+** the lock context (i.e. until a corresponding vcsm_unlock_xx
+** is invoked).
+*/
+void *vcsm_lock( unsigned int handle );
+
+
+/* Locks the memory associated with this opaque handle. The lock
+** also gives a chance to update the *host* cache behavior of the
+** allocated buffer if so desired. The *videocore* cache behavior
+** of the allocated buffer cannot be changed by this call and such
+** attempt will be ignored.
+**
+** The system will attempt to honour the cache_update mode request;
+** the cache_result mode will provide the final answer on which cache
+** mode is really in use. Failing to change the cache mode will not
+** result in a failure to lock the buffer, as it is an application
+** decision what to do if (cache_result != cache_update).
+**
+** The value returned in cache_result can only be considered valid if
+** the returned pointer is non NULL. The cache_result pointer may be
+** NULL if the application does not care about the actual outcome of
+** its action with regards to the cache behavior change.
+**
+** Returns: NULL on error
+** a valid pointer on success.
+**
+** A user MUST lock the handle received from vcsm_malloc
+** in order to be able to use the memory associated with it.
+**
+** On success, the pointer returned is only valid within
+** the lock context (i.e. until a corresponding vcsm_unlock_xx
+** is invoked).
+*/
+void *vcsm_lock_cache( unsigned int handle,
+ VCSM_CACHE_TYPE_T cache_update,
+ VCSM_CACHE_TYPE_T *cache_result );
+
+
+/* Unlocks the memory associated with this user mapped address.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking a mapped address, the user should no longer
+** attempt to reference it.
+*/
+int vcsm_unlock_ptr( void *usr_ptr );
+
+
+/* Unlocks the memory associated with this user mapped address.
+** Apply special processing that would override the otherwise
+** default behavior.
+**
+** If 'cache_no_flush' is specified:
+** Do not flush cache as the result of the unlock (if cache
+** flush was otherwise applicable in this case).
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking a mapped address, the user should no longer
+** attempt to reference it.
+*/
+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+
+
+/* Unlocks the memory associated with this user opaque handle.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking an opaque handle, the user should no longer
+** attempt to reference the mapped address once associated
+** with it.
+*/
+int vcsm_unlock_hdl( unsigned int handle );
+
+
+/* Unlocks the memory associated with this user opaque handle.
+** Apply special processing that would override the otherwise
+** default behavior.
+**
+** If 'cache_no_flush' is specified:
+** Do not flush cache as the result of the unlock (if cache
+** flush was otherwise applicable in this case).
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking an opaque handle, the user should no longer
+** attempt to reference the mapped address once associated
+** with it.
+*/
+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+
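+/* Illustrative usage sketch (not part of the API): a minimal
+** allocate / lock / use / unlock / free sequence using only the
+** functions declared in this header. Error handling and service
+** start-up/shut-down are omitted.
+**
+**   unsigned int h = vcsm_malloc(4096, "example");
+**   if (h != 0) {
+**       void *p = vcsm_lock(h);
+**       if (p != NULL) {
+**           memset(p, 0, 4096);       // only touch the memory while locked
+**           vcsm_unlock_hdl(h);       // or vcsm_unlock_ptr(p)
+**       }
+**       vcsm_free(h);
+**   }
+*/
+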
+/* Clean and/or invalidate the memory associated with this user opaque handle
+**
+** Returns: non-zero on error
+**
+** The structure contains a list of flush/invalidate commands. Commands are:
+** 0: nop
+** 1: invalidate given virtual range in L1/L2
+** 2: clean given virtual range in L1/L2
+** 3: clean+invalidate given virtual range in L1/L2
+** 4: flush all L1/L2
+*/
+struct vcsm_user_clean_invalid_s {
+ struct {
+ unsigned int cmd;
+ unsigned int handle;
+ unsigned int addr;
+ unsigned int size;
+ } s[8];
+};
+
+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __USER_VCSM__H__INCLUDED__ */
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
index 0000000..9580165
--- /dev/null
+++ b/libavcodec/rpi_zc.c
@@ -0,0 +1,406 @@
+#include "config.h"
+#ifdef RPI
+#include "rpi_qpu.h"
+#include "rpi_zc.h"
+
+#include "libavutil/buffer_internal.h"
+
+struct ZcPoolEnt;
+
+typedef struct ZcPool
+{
+ int numbytes;
+ struct ZcPoolEnt * head;
+ pthread_mutex_t lock;
+} ZcPool;
+
+typedef struct ZcPoolEnt
+{
+ // It is important that we start with gmem as other bits of code will expect to see that
+ GPU_MEM_PTR_T gmem;
+ struct ZcPoolEnt * next;
+ struct ZcPool * pool;
+} ZcPoolEnt;
+
+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
+{
+ ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ goto fail0;
+ }
+
+ if (gpu_malloc_cached(size, &zp->gmem) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
+ goto fail1;
+ }
+
+ zp->next = NULL;
+ zp->pool = pool;
+ return zp;
+
+fail1:
+ av_free(zp);
+fail0:
+ return NULL;
+}
+
+static void zc_pool_ent_free(ZcPoolEnt * const zp)
+{
+ gpu_free(&zp->gmem);
+ av_free(zp);
+}
+
+static void zc_pool_flush(ZcPool * const pool)
+{
+ ZcPoolEnt * p = pool->head;
+ pool->head = NULL;
+ while (p != NULL)
+ {
+ ZcPoolEnt * const zp = p;
+ p = p->next;
+ zc_pool_ent_free(zp);
+ }
+}
+
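+// The pool only ever holds buffers of a single size: if the requested size
+// differs from the size currently held, the free list is flushed and the
+// pool switches to the new size. This suits the use here, where successive
+// frames of a stream share one allocation size.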
+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes)
+{
+ ZcPoolEnt * zp;
+ pthread_mutex_lock(&pool->lock);
+
+ if (numbytes != pool->numbytes)
+ {
+ zc_pool_flush(pool);
+ pool->numbytes = numbytes;
+ }
+
+ if (pool->head != NULL)
+ {
+ zp = pool->head;
+ pool->head = zp->next;
+ }
+ else
+ {
+ zp = zc_pool_ent_alloc(pool, numbytes);
+ }
+
+ pthread_mutex_unlock(&pool->lock);
+ return zp;
+}
+
+static void zc_pool_free(ZcPoolEnt * const zp)
+{
+ ZcPool * const pool = zp == NULL ? NULL : zp->pool;
+ if (zp != NULL)
+ {
+ pthread_mutex_lock(&pool->lock);
+ if (pool->numbytes == zp->gmem.numbytes)
+ {
+ zp->next = pool->head;
+ pool->head = zp;
+ pthread_mutex_unlock(&pool->lock);
+ }
+ else
+ {
+ pthread_mutex_unlock(&pool->lock);
+ zc_pool_ent_free(zp);
+ }
+ }
+}
+
+static void
+zc_pool_init(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ pool->head = NULL;
+ pthread_mutex_init(&pool->lock, NULL);
+}
+
+static void
+zc_pool_destroy(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ zc_pool_flush(pool);
+ pthread_mutex_destroy(&pool->lock);
+}
+
+
+typedef struct AVZcEnv
+{
+ ZcPool pool;
+} ZcEnv;
+
+// Callback when buffer unrefed to zero
+static void rpi_free_display_buffer(void *opaque, uint8_t *data)
+{
+ ZcPoolEnt *const zp = opaque;
+// printf("%s: data=%p\n", __func__, data);
+ zc_pool_free(zp);
+}
+
+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
+{
+    // Kludge: we check the free fn to verify this is really
+    // one of our buffers - can't think of a better way
+ return buf == NULL || buf->buffer->free != rpi_free_display_buffer ? NULL :
+ av_buffer_get_opaque(buf);
+}
+
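+// Both stride and height are padded by 32 and rounded up to a multiple of 32.
+// Worked example (illustrative): a 1920x1080 frame gives stride_y = 1952,
+// height_y = 1120, stride_c = 976, height_c = 560, so one contiguous picture
+// needs 1952*1120 + 2*(976*560) = 3279360 bytes (see rpi_get_display_buffer).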
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const unsigned int video_width, const unsigned int video_height)
+{
+ AVRpiZcFrameGeometry geo;
+ geo.stride_y = (video_width + 32 + 31) & ~31;
+ geo.stride_c = geo.stride_y / 2;
+// geo.height_y = (video_height + 15) & ~15;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ return geo;
+}
+
+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+{
+ ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ AVBufferRef * buf;
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ goto fail0;
+ }
+
+ if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ goto fail2;
+ }
+
+ return buf;
+
+fail2:
+ zc_pool_free(zp);
+fail0:
+ return NULL;
+}
+
+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame)
+{
+ ZcEnv *const zc = s->get_buffer_context;
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height);
+ const unsigned int size_y = geo.stride_y * geo.height_y;
+ const unsigned int size_c = geo.stride_c * geo.height_c;
+ const unsigned int size_pic = size_y + size_c * 2;
+ AVBufferRef * buf;
+ unsigned int i;
+
+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
+
+ if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+ {
+ av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ return AVERROR(ENOMEM);
+ }
+
+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+ frame->buf[i] = NULL;
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ }
+
+ frame->buf[0] = buf;
+ frame->linesize[0] = geo.stride_y;
+ frame->linesize[1] = geo.stride_c;
+ frame->linesize[2] = geo.stride_c;
+ frame->data[0] = buf->data;
+ frame->data[1] = frame->data[0] + size_y;
+ frame->data[2] = frame->data[1] + size_c;
+ frame->extended_data = frame->data;
+ // Leave extended buf alone
+
+ return 0;
+}
+
+
+#define RPI_GET_BUFFER2 1
+
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+{
+#if !RPI_GET_BUFFER2
+ return avcodec_default_get_buffer2(s, frame, flags);
+#else
+ int rv;
+
+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 ||
+ frame->format != AV_PIX_FMT_YUV420P)
+ {
+// printf("Do default alloc: format=%#x\n", frame->format);
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+ else
+ {
+ rv = rpi_get_display_buffer(s, frame);
+ }
+
+#if 0
+ printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+ frame->width, frame->height,
+ frame->linesize[0], frame->linesize[1], frame->linesize[2],
+ frame->data[0], frame->data[1], frame->data[2],
+ frame->buf[0], frame->buf[1], frame->buf[2],
+ av_buffer_get_opaque(frame->buf[0]));
+#endif
+ return rv;
+#endif
+}
+
+
+static AVBufferRef * zc_copy(struct AVCodecContext * const s,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * pdest;
+
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (rpi_get_display_buffer(s, dest) != 0)
+ {
+ return NULL;
+ }
+
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ memcpy(pdest, psrc, dest->width);
+ }
+ for (i = 0, psrc = src->data[1], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+ for (i = 0, psrc = src->data[2], pdest = dest->data[2];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+
+ return dest->buf[0];
+}
+
+
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const AVFrame * const frame, const int maycopy)
+{
+ assert(s != NULL);
+
+ if (frame->format != AV_PIX_FMT_YUV420P)
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format);
+ return NULL;
+ }
+
+ if (frame->buf[1] != NULL)
+ {
+ if (maycopy)
+ {
+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+ return zc_copy(s, frame);
+ }
+ else
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ if (pic_gm_ptr(frame->buf[0]) == NULL)
+ {
+ if (maycopy)
+ {
+ av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
+ return zc_copy(s, frame);
+ }
+ else
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ return av_buffer_ref(frame->buf[0]);
+}
+
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? -1 : p->vc_handle;
+}
+
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : p->numbytes;
+}
+
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
+{
+ if (fr_ref != NULL)
+ {
+ av_buffer_unref(&fr_ref);
+ }
+}
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void)
+{
+ ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
+ if (zc == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
+ return NULL;
+ }
+
+ zc_pool_init(&zc->pool);
+ return zc;
+}
+
+void av_rpi_zc_env_free(AVZcEnvPtr zc)
+{
+ if (zc != NULL)
+ {
+        zc_pool_destroy(&zc->pool);
+ av_free(zc);
+ }
+}
+
+int av_rpi_zc_init(struct AVCodecContext * const s)
+{
+ ZcEnv * const zc = av_rpi_zc_env_alloc();
+ if (zc == NULL)
+ {
+ return AVERROR(ENOMEM);
+ }
+
+ s->get_buffer_context = zc;
+ s->get_buffer2 = av_rpi_zc_get_buffer2;
+ return 0;
+}
+
+void av_rpi_zc_uninit(struct AVCodecContext * const s)
+{
+ if (s->get_buffer2 == av_rpi_zc_get_buffer2)
+ {
+ ZcEnv * const zc = s->get_buffer_context;
+ s->get_buffer2 = avcodec_default_get_buffer2;
+ s->get_buffer_context = NULL;
+ av_rpi_zc_env_free(zc);
+ }
+}
+
+#endif // RPI
+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000..f0109f4
--- /dev/null
+++ b/libavcodec/rpi_zc.h
@@ -0,0 +1,83 @@
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
+// Zero-Copy frame code for RPi
+// RPi needs Y/U/V planes to be contiguous for display. By default
+// ffmpeg will allocate separate planes, so a memcpy is needed before
+// display. This code provides a method of making ffmpeg allocate a single
+// block of memory for the frame, which can then be reference counted until
+// display has finished with it.
+
+#include "libavutil/frame.h"
+#include "libavcodec/avcodec.h"
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+ unsigned int stride_y;
+ unsigned int height_y;
+ unsigned int stride_c;
+ unsigned int height_c;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_open2
+//
+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1, as otherwise the buffer info is killed before being returned
+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
+// returned must be manually unrefed with av_frame_unref. This should be done
+// after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by _get_buffer_2
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const AVFrame * const frame, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Free an environment previously allocated by av_rpi_zc_env_alloc
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Free ZC from a context
+// There is nothing magic in this fn - it just packages unsetting
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
+
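+// Illustrative usage sketch (assumes an HEVC AVCodecContext *avctx and a
+// decoded AVFrame *frame; error handling omitted):
+//
+//   av_rpi_zc_init(avctx);               // before avcodec_open2
+//   avctx->refcounted_frames = 1;        // required, see note above
+//   ... decode into frame ...
+//   AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, 1);
+//   int vc = av_rpi_zc_vc_handle(ref);   // pass to MMAL/VideoCore for display
+//   av_frame_unref(frame);               // safe: ref keeps the buffer alive
+//   ... once display has finished with the buffer ...
+//   av_rpi_zc_unref(ref);
+//   av_rpi_zc_uninit(avctx);
+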
+#endif
+
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index f7adb52..708526e 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
*/
#include "config.h"
+
+#ifdef RPI
+// Move video buffers to GPU memory
+#define RPI_GPU_BUFFERS
+#endif
+
#include "libavutil/atomic.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
@@ -64,6 +70,10 @@
#include "libavutil/ffversion.h"
const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+#ifdef RPI_GPU_BUFFERS
+#include "rpi_qpu.h"
+#endif
+
#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
static int default_lockmgr_cb(void **arg, enum AVLockOp op)
{
@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
return ret;
}
+#ifdef RPI_GPU_BUFFERS
+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+{
+ GPU_MEM_PTR_T *p = opaque;
+ gpu_free(p);
+ av_free(p);
+}
+
+static AVBufferRef *rpi_buffer_alloc(int size)
+{
+ AVBufferRef *ret = NULL;
+ uint8_t *data = NULL;
+ GPU_MEM_PTR_T *p;
+
+ static int total=0;
+ total+=size;
+
+ p = av_malloc(sizeof *p);
+ if (!p)
+ return NULL;
+
+    if (gpu_malloc_cached(size,p)<0) { // Change this line to choose cached or uncached memory. The caching here refers to the ARM data cache.
+        av_free(p);
+        return NULL;
+    }
+
+ data = p->arm;
+ printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+ //memset(data, 64, size);
+
+ if (!data)
+ return NULL;
+
+ ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+ if (!ret) {
+ gpu_free(p);
+ av_freep(&p);
+ }
+
+ return ret;
+}
+#endif
+
static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
{
FramePool *pool = avctx->internal->pool;
@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
av_buffer_pool_uninit(&pool->pools[i]);
pool->linesize[i] = linesize[i];
if (size[i]) {
+#ifdef RPI_GPU_BUFFERS
+ if (avctx->codec_id == AV_CODEC_ID_HEVC)
+ pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+ CONFIG_MEMORY_POISONING ?
+ NULL :
+ rpi_buffer_alloc);
+ else
+#endif
pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
CONFIG_MEMORY_POISONING ?
NULL :
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index b31d233..2767306 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
#endif
{ 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
{ 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
{ 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
{ 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
{ 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 6f343f2..83f26d5 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
int default_stream_index = av_find_default_stream_index(s);
if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
for (i = 0; i < s->nb_streams; i++) {
- if (av_find_program_from_stream(s, NULL, i))
+ if (0 && av_find_program_from_stream(s, NULL, i))
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
diff --git a/libavutil/buffer.c b/libavutil/buffer.c
index 694e116..203ca7b 100644
--- a/libavutil/buffer.c
+++ b/libavutil/buffer.c
@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
return ret;
}
+
+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+ BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+ return buf->opaque;
+}
diff --git a/libavutil/buffer.h b/libavutil/buffer.h
index 0c0ce12..82e0bc3 100644
--- a/libavutil/buffer.h
+++ b/libavutil/buffer.h
@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
*/
AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+// Return the opaque for the underlying frame
+void *av_buffer_pool_opaque(AVBufferRef *ref);
+
/**
* @}
*/
diff --git a/pi-util/conf.sh b/pi-util/conf.sh
new file mode 100755
index 0000000..8b596a2
--- /dev/null
+++ b/pi-util/conf.sh
@@ -0,0 +1,33 @@
+echo "Configure for Pi2/3"
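+# Assumes it is run from the top of the ffmpeg source tree, with a Raspbian
+# sysroot and the arm-bcm2708 cross toolchain unpacked under ./build
+# (see RPI_ROOTFS and RPI_TOOLROOT below).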
+
+RPI_BUILDROOT=`pwd`/build
+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --arch=armv6t2\
+ --cpu=cortex-a7\
+ --target-os=linux\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000..61d1399
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100644
index 0000000..38f942f
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+conf_root = "/opt/conform/h265"
+ffmpeg_exec = "./ffmpeg"
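+
+# Illustrative usage (run from the ffmpeg build directory so ./ffmpeg exists,
+# with the conformance streams unpacked under /opt/conform/h265):
+#
+#   python pi-util/ffconf.py             # run every test listed in the CSV
+#   python pi-util/ffconf.py WPP RPS     # only tests whose names start with WPP or RPS
+#   python pi-util/ffconf.py --csvgen    # print a fresh CSV (to stdout) by scanning conf_root
+#
+# CSV columns: expected-result flag (1 = expect pass, 2 = currently expected
+# to fail, 0 = skip), test name, bitstream file, md5 file.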
+
+def testone(fileroot, name, es_file, md5_file):
+ tmp_root = "/tmp"
+
+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
+ try:
+ os.remove(dec_file)
+ except:
+ pass
+
+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+ # Unaligned needed for cropping conformance
+ rstr = subprocess.call(
+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
+ m1 = None
+ m2 = None
+ with open(os.path.join(fileroot, md5_file)) as f:
+ for line in f:
+ m1 = re.search("[0-9a-f]{32}", line.lower())
+ if m1:
+ break
+
+ with open(dec_file) as f:
+ m2 = re.search("[0-9a-f]{32}", f.readline())
+ except:
+ pass
+
+ rv = False
+ if m1 and m2 and m1.group() == m2.group():
+ print >> flog, "Match: " + m1.group()
+ rv = True
+ elif not m1:
+ print >> flog, "****** Cannot find m1"
+ elif not m2:
+ print >> flog, "****** Cannot find m2"
+ else:
+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+ flog.close()
+ return rv
+
+def scandir(root):
+ aconf = []
+ ents = os.listdir(conf_root)
+ ents.sort(key=str.lower)
+ for name in ents:
+ test_path = os.path.join(conf_root, name)
+ if S_ISDIR(os.stat(test_path).st_mode):
+ files = os.listdir(test_path)
+ es_file = "?"
+ md5_file = "?"
+ for f in files:
+ (base, ext) = os.path.splitext(f)
+ if base[0] == '.':
+ pass
+ elif ext == ".bit" or ext == ".bin":
+ es_file = f
+ elif ext == ".md5":
+ if md5_file == "?":
+ md5_file = f
+ elif base[-3:] == "yuv":
+ md5_file = f
+ aconf.append((1, name, es_file, md5_file))
+ return aconf
+
+def runtest(name, tests):
+ if not tests:
+ return True
+ for t in tests:
+ if name[0:len(t)] == t:
+ return True
+ return False
+
+def doconf(csva, tests):
+ failures = []
+ unx_success = []
+ for a in csva:
+ exp_test = int(a[0])
+ if (exp_test and runtest(a[1], tests)):
+ name = a[1]
+ print "==== ", name,
+ sys.stdout.flush()
+
+ if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
+ if exp_test == 1:
+ failures.append(name)
+ print ": * FAIL *"
+ else:
+ print ": fail"
+ else:
+ if exp_test == 2:
+ print ": * OK *"
+ unx_success.append(name)
+ else:
+ print ": ok"
+
+
+ if failures or unx_success:
+ print "Unexpected Failures:", failures
+ print "Unexpected Success: ", unx_success
+ else:
+ print "All tests normal"
+
+
+class ConfCSVDialect(csv.Dialect):
+ delimiter = ','
+ doublequote = True
+ lineterminator = '\n'
+ quotechar='"'
+ quoting = csv.QUOTE_MINIMAL
+ skipinitialspace = True
+ strict = True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+ argp.add_argument("tests", nargs='*')
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename")
+ args = argp.parse_args()
+
+ if args.csvgen:
+ csv.writer(sys.stdout).writerows(scandir(conf_root))
+ exit(0)
+
+ with open(args.csv, 'rt') as csvfile:
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+
+ doconf(csva, args.tests)
+
diff --git a/pi-util/qasm.py b/pi-util/qasm.py
new file mode 100644
index 0000000..1eacc04
--- /dev/null
+++ b/pi-util/qasm.py
@@ -0,0 +1,2502 @@
+#!/usr/bin/env python
+
+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment
+# add r0, r0, 1 # implicit mul nop
+# nop # explicit add nop, implicit mul nop
+# bkpt # implicit add/mul nop
+# mov r0, 0x1234 # hex immediate
+# mov r0, 20 * 40 # expressions...
+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits
+# mov r0, a:label # put address of label in r0
+# :label
+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address
+# :1
+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address
+# :1 # multiple definitions of numeric labels (differentiated using f/b)
+# .set my_val, 3 # introduce alias for 3
+# .set my_reg, r0 # and for r0
+# mov my_reg, my_val # then use them
+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3
+# .macro my_add, a, b, c # a, b, c act as if .set on entry
+# .set my_val, 10
+# add a, b, c
+# mov r0, my_val # 10
+# .endm # forget all .sets since .macro (including arg .sets)
+# mov r0, my_val # 3
+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right)
+
+import math
+import optparse
+import os
+import random
+import re
+import struct
+import sys
+import time
+
+###############################################################################
+# constants
+###############################################################################
+
+# ops
+######
+
+# negatives are internal qasm ops
+
+AOP_MOV = -3 # two operands
+AOP_BRA = -2 # two operands
+AOP_BRR = -1 # two operands
+AOP_NOP = 0x00 # no operands
+AOP_FADD = 0x01
+AOP_FSUB = 0x02
+AOP_FMIN = 0x03
+AOP_FMAX = 0x04
+AOP_FMINABS = 0x05
+AOP_FMAXABS = 0x06
+AOP_FTOI = 0x07 # two operands
+AOP_ITOF = 0x08 # two operands
+AOP_ADD = 0x0c
+AOP_SUB = 0x0d
+AOP_SHR = 0x0e
+AOP_ASR = 0x0f
+AOP_ROR = 0x10
+AOP_SHL = 0x11
+AOP_MIN = 0x12
+AOP_MAX = 0x13
+AOP_AND = 0x14
+AOP_OR = 0x15
+AOP_XOR = 0x16
+AOP_NOT = 0x17 # two operands
+AOP_CLZ = 0x18 # two operands
+AOP_V8ADDS = 0x1e
+AOP_V8SUBS = 0x1f
+
+MOP_MOV = -1 # two operands
+MOP_NOP = 0x0 # no operands
+MOP_FMUL = 0x1
+MOP_MUL24 = 0x2
+MOP_V8MULD = 0x3
+MOP_V8MIN = 0x4
+MOP_V8MAX = 0x5
+MOP_V8ADDS = 0x6
+MOP_V8SUBS = 0x7
+
+# ldi modes
+############
+
+LDI_32 = 0
+LDI_EL_SIGNED = 1
+LDI_EL_UNSIGNED = 3
+LDI_SEMA = 4
+
+# conds
+########
+
+COND_NEVER = 0
+COND_ALWAYS = 1
+COND_IFZ = 2
+COND_IFNZ = 3
+COND_IFN = 4
+COND_IFNN = 5
+COND_IFC = 6
+COND_IFNC = 7
+
+BCOND_ALLZ = 0
+BCOND_ALLNZ = 1
+BCOND_ANYZ = 2
+BCOND_ANYNZ = 3
+BCOND_ALLN = 4
+BCOND_ALLNN = 5
+BCOND_ANYN = 6
+BCOND_ANYNN = 7
+BCOND_ALLC = 8
+BCOND_ALLNC = 9
+BCOND_ANYC = 10
+BCOND_ANYNC = 11
+BCOND_ALWAYS = 15
+
+# packing/unpacking
+####################
+
+# regfile a pack modes
+PACK_A_NOP = 0
+PACK_A_16A = 1
+PACK_A_16B = 2
+PACK_A_8888 = 3
+PACK_A_8A = 4
+PACK_A_8B = 5
+PACK_A_8C = 6
+PACK_A_8D = 7
+PACK_A_32S = 8
+PACK_A_16AS = 9
+PACK_A_16BS = 10
+PACK_A_8888S = 11
+PACK_A_8AS = 12
+PACK_A_8BS = 13
+PACK_A_8CS = 14
+PACK_A_8DS = 15
+
+# mul unit pack modes
+PACK_MUL_NOP = 0
+PACK_MUL_8888 = 3
+PACK_MUL_8A = 4
+PACK_MUL_8B = 5
+PACK_MUL_8C = 6
+PACK_MUL_8D = 7
+
+# regfile a unpack modes
+UNPACK_A_NOP = 0
+UNPACK_A_16A = 1
+UNPACK_A_16B = 2
+UNPACK_A_8R = 3
+UNPACK_A_8A = 4
+UNPACK_A_8B = 5
+UNPACK_A_8C = 6
+UNPACK_A_8D = 7
+
+# r4 unpack modes
+UNPACK_R4_NOP = 0
+UNPACK_R4_16A = 1
+UNPACK_R4_16B = 2
+UNPACK_R4_8R = 3
+UNPACK_R4_8A = 4
+UNPACK_R4_8B = 5
+UNPACK_R4_8C = 6
+UNPACK_R4_8D = 7
+
+PACK_TYPE_INT = 0
+PACK_TYPE_FLOAT = 1
+PACK_TYPE_EITHER = -1
+
+PACK_MODE_A = 0 # regfile a
+PACK_MODE_M = 1 # mul unit
+PACK_MODE_EITHER = -1
+
+UNPACK_LOC_A = 0 # regfile a
+UNPACK_LOC_R4 = 1 # r4
+UNPACK_LOC_AB = 2 # either regfile a or regfile b
+UNPACK_LOC_OTHER = 3 # somewhere else
+
+# args
+#######
+
+# loc_t, ie internal
+MUX_AC = 0
+MUX_ANY = 1
+MUX_A = 2
+MUX_B = 3
+RW_EITHER = 0
+RW_READ = 1
+RW_WRITE = 2
+
+RADDR_NOP = 39
+
+# negatives are for internal use
+RMUX_SEMA = -6
+RMUX_LABEL = -5
+RMUX_IMMV = -4
+RMUX_IMM = -3
+RMUX_AC = -2
+RMUX_ANY = -1
+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5
+RMUX_A = 6
+RMUX_B = 7
+
+WADDR_R0 = 32 # followed by R1, R2, R3
+WADDR_NOP = 39
+
+WMUX_ANY = 0
+WMUX_A = 1
+WMUX_B = 2
+
+# signals
+##########
+
+SIG_BKPT = 0
+SIG_NORMAL = 1
+SIG_THRSW = 2
+SIG_THREND = 3
+SIG_SBWAIT = 4
+SIG_SBDONE = 5
+SIG_INT = 6 # on a0
+SIG_LTHRSW = 6 # on b0
+SIG_LOADCV = 7
+SIG_LOADC = 8
+SIG_LDCEND = 9
+SIG_LDTMU0 = 10
+SIG_LDTMU1 = 11
+SIG_ROTATE = 12 # on a0
+SIG_LOADAM = 12 # on b0
+SIG_SMALLIMMED = 13
+SIG_IMMED = 14
+SIG_BRANCH = 15
+
+# multi-line assembler constructs
+##################################
+
+CONSTRUCT_MACRO = 0x1
+CONSTRUCT_IF = 0x2
+CONSTRUCT_ELSE = 0x4
+CONSTRUCT_REP = 0x8
+
+###############################################################################
+# helpers
+###############################################################################
+
+def asm_error(message, location = None):
+ if location is None:
+ location = current_location
+ if location == '':
+ sys.stderr.write('qasm ERROR: %s\n' % message)
+ else:
+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message))
+ sys.exit(-1)
+
+def asm_warning(message, location = None):
+ if disable_warnings or (nwarn_level != 0):
+ return
+ if location is None:
+ location = current_location
+ if location == '':
+ sys.stderr.write('qasm WARNING: %s\n' % message)
+ else:
+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message))
+ if warnings_are_errors:
+ asm_error('warnings are errors!', location)
+
+# smart_split('') = []
+# smart_split('a') = ['a']
+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
+def smart_split(s, delim = ',', count = 0):
+ if len(s) == 0:
+ return []
+ parts = []
+ depth = 0
+ i = 0
+ for j in xrange(len(s)):
+ if s[j] in '([{':
+ depth += 1
+ elif s[j] in ')]}':
+ depth -= 1
+ elif (s[j] == delim) and (depth == 0):
+ parts.append(s[i:j])
+ i = j + 1
+ if len(parts) == count:
+ break
+ if depth != 0:
+ asm_error('bracket nesting fail')
+ parts.append(s[i:])
+ return parts
+
+def is_int(x):
+ return isinstance(x, int) or isinstance(x, long)
+
+###############################################################################
+# "parsing" stuff
+###############################################################################
+
+re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$')
+re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
+re_include = re.compile('\\.include\\s(?P<filename>.+)$')
+re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
+re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
+re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
+re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
+re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
+re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$')
+re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
+re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$')
+re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$')
+re_label_ref_left = re.compile('\\b([ar]):')
+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$')
+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
+
+# ops
+######
+
+aops = {
+ 'mov': (AOP_MOV, 2),
+ 'bra': (AOP_BRA, 2),
+ 'brr': (AOP_BRR, 2),
+ 'nop': (AOP_NOP, 0),
+ 'fadd': (AOP_FADD, 3),
+ 'fsub': (AOP_FSUB, 3),
+ 'fmin': (AOP_FMIN, 3),
+ 'fmax': (AOP_FMAX, 3),
+ 'fminabs': (AOP_FMINABS, 3),
+ 'fmaxabs': (AOP_FMAXABS, 3),
+ 'ftoi': (AOP_FTOI, 2),
+ 'itof': (AOP_ITOF, 2),
+ 'add': (AOP_ADD, 3),
+ 'sub': (AOP_SUB, 3),
+ 'shr': (AOP_SHR, 3),
+ 'asr': (AOP_ASR, 3),
+ 'ror': (AOP_ROR, 3),
+ 'shl': (AOP_SHL, 3),
+ 'min': (AOP_MIN, 3),
+ 'max': (AOP_MAX, 3),
+ 'and': (AOP_AND, 3),
+ 'or': (AOP_OR, 3),
+ 'xor': (AOP_XOR, 3),
+ 'not': (AOP_NOT, 2),
+ 'clz': (AOP_CLZ, 2),
+ 'v8adds': (AOP_V8ADDS, 3),
+ 'v8subs': (AOP_V8SUBS, 3)}
+
+def get_aop(aop):
+ if aop not in aops:
+ asm_error('invalid aop')
+ return aops[aop]
+
+mops = {
+ 'mov': (MOP_MOV, 2),
+ 'nop': (MOP_NOP, 0),
+ 'fmul': (MOP_FMUL, 3),
+ 'mul24': (MOP_MUL24, 3),
+ 'v8muld': (MOP_V8MULD, 3),
+ 'v8min': (MOP_V8MIN, 3),
+ 'v8max': (MOP_V8MAX, 3),
+ 'v8adds': (MOP_V8ADDS, 3),
+ 'v8subs': (MOP_V8SUBS, 3)}
+
+def get_mop(mop):
+ if mop not in mops:
+ asm_error('invalid mop')
+ return mops[mop]
+
+# conds
+########
+
+conds = {
+ 'ifz': COND_IFZ,
+ 'ifnz': COND_IFNZ,
+ 'ifn': COND_IFN,
+ 'ifnn': COND_IFNN,
+ 'ifc': COND_IFC,
+ 'ifnc': COND_IFNC}
+
+def get_cond(cond):
+ if not cond:
+ return COND_ALWAYS
+ if cond not in conds:
+ asm_error('invalid cond')
+ return conds[cond]
+
+bconds = {
+ 'allz': BCOND_ALLZ,
+ 'allnz': BCOND_ALLNZ,
+ 'anyz': BCOND_ANYZ,
+ 'anynz': BCOND_ANYNZ,
+ 'alln': BCOND_ALLN,
+ 'allnn': BCOND_ALLNN,
+ 'anyn': BCOND_ANYN,
+ 'anynn': BCOND_ANYNN,
+ 'allc': BCOND_ALLC,
+ 'allnc': BCOND_ALLNC,
+ 'anyc': BCOND_ANYC,
+ 'anync': BCOND_ANYNC}
+
+def get_bcond(bcond):
+ if not bcond:
+ return BCOND_ALWAYS
+ if bcond not in bconds:
+ asm_error('invalid bcond')
+ return bconds[bcond]
+
+def get_setf(setf):
+ if not setf:
+ return False
+ return True
+
+# packing/unpacking
+####################
+
+packs = {
+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A),
+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A),
+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A),
+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A),
+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A),
+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A),
+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)}
+
+def get_pack(pack):
+ if not pack:
+ return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER)
+ if pack not in packs:
+ asm_error('invalid pack')
+ return packs[pack]
+
+a_unpacks = {
+ '16a': (UNPACK_A_16A, PACK_TYPE_INT),
+ '16b': (UNPACK_A_16B, PACK_TYPE_INT),
+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT),
+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT),
+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER),
+ '8a': (UNPACK_A_8A, PACK_TYPE_INT),
+ '8b': (UNPACK_A_8B, PACK_TYPE_INT),
+ '8c': (UNPACK_A_8C, PACK_TYPE_INT),
+ '8d': (UNPACK_A_8D, PACK_TYPE_INT),
+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT),
+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT),
+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT),
+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)}
+
+def get_a_unpack(unpack):
+ if not unpack:
+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A)
+ if unpack not in a_unpacks:
+ asm_error('invalid ra unpack')
+ return a_unpacks[unpack] + (UNPACK_LOC_A,)
+
+r4_unpacks = {
+ '16af': UNPACK_R4_16A,
+ '16bf': UNPACK_R4_16B,
+ '8dr': UNPACK_R4_8R,
+ '8ac': UNPACK_R4_8A,
+ '8bc': UNPACK_R4_8B,
+ '8cc': UNPACK_R4_8C,
+ '8dc': UNPACK_R4_8D}
+
+def get_r4_unpack(unpack):
+ if not unpack:
+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4)
+ if unpack not in r4_unpacks:
+ asm_error('invalid r4 unpack')
+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4)
+
+# args
+#######
+
+class loc_t:
+ def __init__(self, mux, i, rot, r5_rot, pack, rw):
+ self.mux = mux
+ self.i = i
+ self.rot = rot % 16
+ self.r5_rot = r5_rot % 16
+ self.pack = pack
+ self.rw = rw
+
+ def copy(self):
+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw)
+
+ def __add__(self, i):
+ if not is_int(i):
+ raise Exception('can only add integer to loc')
+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw)
+
+ def __sub__(self, i):
+ if not is_int(i):
+ raise Exception('can only subtract integer from loc')
+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw)
+
+ def __cmp__(self, other):
+ if is_int(other):
+ return cmp(self.i, other)
+ if not isinstance(other, loc_t):
+ raise Exception('can only compare loc to integer or other loc')
+ if self.mux != other.mux:
+ return cmp(self.mux, other.mux)
+ if self.i != other.i:
+ return cmp(self.i, other.i)
+ if self.rot != other.rot:
+ return cmp(self.rot, other.rot)
+ if self.r5_rot != other.r5_rot:
+ return cmp(self.r5_rot, other.r5_rot)
+ return cmp(self.pack, other.pack)
+
+ def is_r5(self):
+ return (self.mux == MUX_AC) and (self.i == 5)
+
+ def shift(self, rot, left):
+ if isinstance(rot, loc_t) and rot.is_r5():
+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack:
+ raise Exception('can\'t rotate by rotated/unpacked r5')
+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw)
+ if not is_int(rot):
+ raise Exception('can only rotate by integer or r5')
+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw)
+
+ def __lshift__(self, rot):
+ return self.shift(rot, True)
+
+ def __rshift__(self, rot):
+ return self.shift(rot, False)
+
+ def __getattr__(self, name):
+ # discard the first character if it is an underscore. this is a total hack
+ # to allow packs starting with a digit to work
+ if name[0] == '_':
+ name = name[1:]
+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks):
+ if self.pack:
+ raise Exception('can\'t specify two packs')
+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw)
+ raise AttributeError()
+
+ def __str__(self):
+ if self.mux == MUX_AC:
+ return 'r%d' % self.i
+ if self.mux == MUX_ANY:
+ return 'rany%d' % self.i
+ if self.mux == MUX_A:
+ return 'ra%d' % self.i
+ if self.mux == MUX_B:
+ return 'rb%d' % self.i
+ assert 0
+
+class sema_t:
+ def __init__(self, acq, i):
+ if not is_int(i):
+ raise Exception('semaphore index must be integer')
+ self.acq = acq
+ self.i = i
+
+class label_t:
+ def __init__(self, rel, name, offset):
+ self.rel = rel
+ self.name = name
+ self.offset = offset
+
+ def __add__(self, offset):
+ return label_t(self.rel, self.name, self.offset + offset)
+
+ def __sub__(self, offset):
+ return label_t(self.rel, self.name, self.offset - offset)
+
+class label_maker_t:
+ def __init__(self, rel):
+ self.rel = rel
+
+ def __getattr__(self, name):
+ # we discard the first character. this is a total hack to allow numeric labels to work
+ if not re_label_ref_right.match(name[1:]):
+ raise Exception('invalid label reference')
+ return label_t(self.rel, name[1:], 0)
+
+def bits(x, n):
+ if (x >> n) != 0:
+ raise Exception('%d doesn\'t fit in %d bits' % (x, n))
+ return x
+
+def bitsw(x, n):
+ if x == (1 << n):
+ x = 0
+ return bits(x, n)
+
+def bitsws(x, n):
+ if x == (1 << (n - 1)):
+ x = 0
+ if -(1 << (n - 1)) <= x < 0:
+ x += 1 << n
+ return bits(x, n)
+
+def vpm_setup(n, stride, addr, v2 = False):
+ horiz, laned, size, y, x, p = addr
+ if size not in (0, 1, 2):
+ raise Exception('addr size should be 0, 1, or 2')
+ if horiz:
+ if x != 0:
+ raise Exception('horizontal accesses must have x of 0')
+ else:
+ if (y & 0xf) != 0:
+ raise Exception('vertical accesses must be 16 row aligned')
+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size)
+ if v2:
+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) |
+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size))
+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) |
+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size))
+
+def vdw_setup_0(n, m, addr):
+ horiz, size, y, x, p = addr
+ if size not in (0, 1, 2):
+ raise Exception('addr size should be 0, 1, or 2')
+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) |
+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size))
+
+def vdr_setup_0(n, m, addr, vpm_stride, stride):
+ horiz, size, y, x, p = addr
+ if size not in (0, 1, 2):
+ raise Exception('addr size should be 0, 1, or 2')
+ if (stride < 8) or (stride & (stride - 1)):
+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride')
+ log2_stride = 3
+ while (1 << log2_stride) != stride:
+ log2_stride += 1
+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) |
+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) |
+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4))
+
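+# Illustrative example: vpm_setup(16, 1, h32(0)) builds a VPM setup word for 16
+# horizontal 32-bit rows starting at Y=0 with a stride of one row per access
+# (the h32/v32/dma_* address helpers are defined in arg_defs below), while
+# vdw_setup_0/vdr_setup_0 build the corresponding VPM DMA setup words from the
+# dma_* helpers.
+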
+class allocator_t:
+ def __init__(self, *available):
+ self.available = list(available)
+ self.allocated = {}
+ self.reserved = []
+
+ def copy(self):
+ a = allocator_t()
+ a.available = self.available[:]
+ a.allocated = self.allocated.copy()
+ a.reserved = self.reserved[:]
+ return a
+
+ def forget(self):
+ self.__init__(self.available + self.allocated.values() + self.reserved)
+
+ def reserve(self, *rs):
+ for r in rs:
+ self.available.remove(r)
+ self.reserved.append(r)
+
+ def retire(self, name):
+ r = self.allocated.pop(name)
+ del r.__invert__
+ del r.retire
+ self.available.append(r)
+ return r
+
+ def __getattr__(self, name):
+ if name not in self.allocated:
+ r = self.available.pop()
+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax
+ r.__invert__ = r.retire
+ self.allocated[name] = r
+ return self.allocated[name]
+
+def pragma_allow_xor_0(x):
+ global allow_xor_0
+
+ if not isinstance(x, bool):
+ raise Exception('allow_xor_0 must be bool')
+ x, allow_xor_0 = allow_xor_0, x
+ return x
+
+def pragma_dont_warn_when_mul_rot_inp_r5(x):
+ global dont_warn_when_mul_rot_inp_r5
+
+ if not isinstance(x, bool):
+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool')
+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x
+ return x
+
+arg_defs = {
+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions)
+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER),
+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER),
+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ),
+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ),
+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE),
+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE),
+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE),
+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ),
+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ),
+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE),
+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE),
+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER),
+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER),
+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER),
+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER),
+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE),
+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE),
+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE),
+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE),
+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER),
+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ),
+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ),
+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE),
+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE),
+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ),
+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ),
+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE),
+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE),
+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER),
+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE),
+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE),
+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE),
+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE),
+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE),
+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE),
+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE),
+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE),
+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE),
+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE),
+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE),
+
+ # semaphore acq/rel
+ 'sacq': lambda i: sema_t(True, i),
+ 'srel': lambda i: sema_t(False, i),
+
+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label)
+ 'r_label_maker': label_maker_t(True),
+ 'a_label_maker': label_maker_t(False),
+
+ # handy functions
+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0],
+ 'sqrt': math.sqrt,
+ 'sin': math.sin,
+ 'cos': math.cos,
+ 'atan2': math.atan2,
+ 'pi': math.pi,
+ 'rseed': random.seed,
+ 'rand': lambda: int(random.getrandbits(32)),
+ 'bits': bits,
+ 'bitsw': bitsw,
+ 'bitsws': bitsws,
+
+ # handy vpm/vdw/vdr stuff
+ 'h32': lambda y: (1, 0, 0, y, 0, 0),
+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p),
+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p),
+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p),
+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p),
+ 'v32': lambda y, x: (0, 0, 0, y, x, 0),
+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p),
+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p),
+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p),
+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p),
+ 'dma_h32': lambda y, x: (1, 0, y, x, 0),
+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p),
+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p),
+ 'dma_v32': lambda y, x: (0, 0, y, x, 0),
+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p),
+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p),
+ 'vpm_setup': vpm_setup,
+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True),
+ 'vdw_setup_0': vdw_setup_0,
+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13),
+ 'vdr_setup_0': vdr_setup_0,
+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride
+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13),
+
+ # annotations
+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)),
+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff),
+ 'preserve_cond': ('preserve_cond', 1),
+
+ # somewhat experimental register allocator
+ 'allocator_t': allocator_t,
+
+ # pragmas
+ 'pragma_allow_xor_0': pragma_allow_xor_0,
+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5}
+
+# accumulators and regs (regular names -- r0, ra0, etc)
+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6))
+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
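+# r0-r5 name the accumulators, ra0-ra63 and rb0-rb63 name regfile A/B
+# locations, and rany0-rany63 leave the regfile choice to the assembler.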
+
+def arg_eval(arg, sets):
+ s = (arg.strip().split('.', 1) + [None])[:2]
+ if s[0] == '-':
+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE)
+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings...
+ arg = re_pack.sub('._\\1', arg)
+ try:
+ # todo: i would like to be able to pass both arg_defs and sets in here
+ # (with sets hiding arg_defs in the case of conflicts), but the obvious
+ # dict(arg_defs, **sets) won't permit things such as:
+ # .set f, lambda x: y
+ # .set y, 4
+ # (when f is later called, the y in the lambda will be looked up in the
+ # temporary dict we created when evaluating the f .set, which doesn't contain y)
+ #
+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the
+ # hiding behaviour, on an unset, we restore any hidden arg_defs value.
+ # also, before dumping sets at the end, we strip out the arg_defs stuff
+ # (this isn't entirely correct as we want to dump sets that are hiding
+ # arg_defs)
+ return eval(arg, sets)
+ except Exception, e:
+ asm_error(e)
+ except:
+ asm_error('unknown error while evaluating argument')
+
+# doesn't check/fixup pack
+def check_and_fixup_loc(loc, read):
+ if (not read) and (loc.rw == RW_READ):
+ asm_error('writing to read-only hardware register')
+ if read and (loc.rw == RW_WRITE):
+ asm_error('reading from write-only hardware register')
+ if not read:
+ # conceptually, we are writing to a location rotated right by
+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by
+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location
+ loc.rot = -loc.rot % 16
+ loc.r5_rot = -loc.r5_rot % 16
+ if (loc.rot != 0) and (loc.r5_rot != 0):
+ asm_error('can\'t rotate by both r5 and immediate')
+ if (loc.r5_rot != 0) and (loc.r5_rot != 1):
+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read])
+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later
+ if not read:
+ asm_error('target doesn\'t support write rotation')
+ if loc.mux == MUX_ANY:
+ loc.mux = MUX_A # can't do rotated read from regfile b
+ if loc.mux != MUX_A:
+ asm_error('rotation on read only allowed from regfile a')
+ if loc.i >= 32:
+ asm_warning('rotation only works from physical regfile')
+ if loc.mux == MUX_AC:
+ if (loc.i < 0) or (loc.i >= 6):
+ asm_error('reg out of range')
+ if not read:
+ if loc.i == 4:
+ asm_error('not allowed to write to r4')
+ if loc.i == 5:
+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep')
+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B):
+ if (loc.i < 0) or (loc.i >= 64):
+ asm_error('reg out of range')
+ else:
+ assert 0
+
+def get_dst(dst, sets):
+ if not dst:
+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0
+ dst = arg_eval(dst, sets)
+ if not isinstance(dst, loc_t):
+ asm_error('invalid dst')
+ dst = dst.copy()
+ check_and_fixup_loc(dst, False)
+ pack = get_pack(dst.pack)
+ if dst.mux == MUX_AC:
+ if pack[2] == PACK_MODE_A:
+ asm_warning('ra packing only works when writing to physical regfile')
+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation
+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32):
+ asm_warning('ra packing only works when writing to physical regfile')
+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+ if dst.mux == MUX_ANY:
+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+ if dst.mux == MUX_B:
+ if pack[2] == PACK_MODE_A:
+ asm_error('this packing operation can only be used for regfile a')
+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot
+ assert 0
+
+def get_src(src, sets):
+ if not src:
+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None
+ src = arg_eval(src, sets)
+ if isinstance(src, sema_t):
+ if not have_sema:
+ asm_error('target does not support semaphores')
+ if (src.i < 0) or (src.i >= 16):
+ asm_error('semaphore number must be in [0, 16)')
+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if isinstance(src, label_t):
+ return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if isinstance(src, list):
+ if len(src) != 16:
+ asm_error('vector immediate must have length 16')
+ src = src[:]
+ for i in xrange(16):
+ if not is_int(src[i]):
+ asm_error('all elements of vector immediate must be integers')
+ src[i] &= (1 << 32) - 1
+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if is_int(src):
+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if not isinstance(src, loc_t):
+ asm_error('invalid src')
+ src = src.copy()
+ check_and_fixup_loc(src, True)
+ if mulw_rotate:
+ srot, sr5rot = 0, 0
+ drot, dr5rot = src.rot, src.r5_rot
+ else:
+ srot, sr5rot = src.rot, src.r5_rot
+ drot, dr5rot = 0, 0
+ if src.mux == MUX_AC:
+ if src.i == 4:
+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot
+ if src.pack:
+ asm_error('unpack only allowed for regfile a or r4')
+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b
+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot
+ if src.mux == MUX_ANY:
+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot
+ if src.mux == MUX_B:
+ if src.pack:
+ asm_error('unpack only allowed for regfile a or r4')
+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+ assert 0
+
+# signals
+##########
+
+sigs = {
+ 'bkpt': SIG_BKPT,
+ 'thrsw': SIG_THRSW,
+ 'thrend': SIG_THREND,
+ 'sbwait': SIG_SBWAIT,
+ 'sbdone': SIG_SBDONE,
+ 'int': SIG_INT,
+ 'loadcv': SIG_LOADCV,
+ 'loadc': SIG_LOADC,
+ 'ldcend': SIG_LDCEND,
+ 'ldtmu0': SIG_LDTMU0,
+ 'ldtmu1': SIG_LDTMU1}
+
+def get_sig(sig):
+ if sig not in sigs:
+ return SIG_NORMAL
+ return sigs[sig]
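+# asm_line treats the last ';'-separated field of an instruction as a signal
+# if it matches one of the names above; otherwise the instruction is
+# assembled with SIG_NORMAL.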
+
+# annotations
+##############
+
+def get_annots(annot, sets):
+ annots = arg_eval(annot, sets)
+ if isinstance(annots, list):
+ annots = annots[:]
+ else:
+ annots = [annots]
+ for i, annot in enumerate(annots):
+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or
+ (not is_int(annot[1]))):
+ asm_error('annotation must be (string, integer) pair, or a list of such pairs')
+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1))
+ return annots
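+# Annotations are appended to an instruction after '@' and must evaluate to a
+# (string, integer) pair or a list of such pairs, e.g. "@ mul_used(0, 1, 2, 3)"
+# marks which output elements of the mul op are actually used.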
+
+###############################################################################
+# core
+###############################################################################
+
+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats):
+ needfloat = PACK_TYPE_EITHER
+ havefloata = False
+ havefloatr4 = False
+ unpacka = None
+ unpackr4 = None
+ forcebs = [False, False, False, False]
+ forcerafloat = False
+
+ pm = PACK_MODE_EITHER
+ for i in (0, 1, 2, 3):
+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB):
+ assert rpacks[i][0] == 0
+ else:
+ if rpacks[i][2] == UNPACK_LOC_A:
+ if unpacka is None:
+ unpacka = rpacks[i][0]
+ elif unpacka != rpacks[i][0]:
+ asm_error('conflicting unpack operations on regfile a')
+ havefloata = havefloata or rfloats[i]
+ elif rpacks[i][2] == UNPACK_LOC_R4:
+ if unpackr4 is None:
+ unpackr4 = rpacks[i][0]
+ elif unpackr4 != rpacks[i][0]:
+ asm_error('conflicting unpack operations on r4')
+ havefloatr4 = havefloatr4 or rfloats[i]
+ else:
+ assert 0
+
+ if rpacks[i][1] != PACK_TYPE_EITHER:
+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]):
+ asm_error('conflicting unpack float requirements')
+ needfloat = rpacks[i][1]
+ for i in (0, 1, 2, 3):
+ if rpacks[i][2] == UNPACK_LOC_AB:
+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP):
+ forcebs[i] = True # non-nop unpack from regfile a. must use b
+
+ if unpacka:
+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat:
+ havefloata = True
+ forcerafloat = True
+ havefloat = havefloata
+ else:
+ havefloat = havefloatr4
+
+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat):
+ asm_error('float unpack operation used in integer alu operations')
+ if (needfloat == PACK_TYPE_INT) and havefloat:
+ asm_error('integer unpack operation used in float alu operation')
+
+ unpack = 0
+ if unpacka and unpackr4:
+ asm_error('cannot specify pack operation for both regfile a and r4')
+ if unpacka:
+ pm = PACK_MODE_A
+ unpack = unpacka
+ elif unpackr4:
+ pm = PACK_MODE_M
+ unpack = unpackr4
+
+ pack = 0
+ if wpacks[0][2] == PACK_MODE_M:
+ asm_error('mul-unit pack operation used on add result')
+ for i in (0, 1):
+ if wpacks[i][2] == PACK_MODE_A:
+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A):
+ asm_error('conflicting pack modes')
+ pm = PACK_MODE_A
+ pack = wpacks[i][0]
+ elif wpacks[i][2] == PACK_MODE_M:
+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M):
+ asm_error('conflicting pack modes')
+ pm = PACK_MODE_M
+ pack = wpacks[i][0]
+
+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]):
+ asm_error('float pack operation used with integer alu result')
+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]:
+ asm_error('integer pack operation used with float alu result')
+
+ if pm == PACK_MODE_EITHER:
+ pm = PACK_MODE_A
+ return pm, pack, unpack, forcebs, forcerafloat
+
+# immediates that can be encoded with SIG_SMALLIMMED
+bimms = {}
+bimms.update((i, i) for i in xrange(16))
+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32))
+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40))
+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48))
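+# i.e. the encodable values are the integers 0..15 and -16..-1 (as 32-bit
+# two's complement), plus the float bit patterns 1.0, 2.0, ..., 128.0 and
+# 1.0/256, 1.0/128, ..., 1.0/2.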
+
+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux):
+ if rmux == RMUX_SEMA:
+ asm_error('semaphore op can only be used with mov')
+ if rmux == RMUX_LABEL:
+ asm_error('label not allowed here')
+ if rmux == RMUX_IMMV:
+ asm_error('vector immediate can only be used with mov')
+ if rmux == RMUX_IMM:
+ if raddr not in bimms:
+ asm_error('can\'t encode immediate 0x%08x' % raddr)
+ raddr = bimms[raddr]
+ if not immb:
+ if raddr_b is not None:
+ asm_error('regfile b and immediates don\'t mix')
+ raddr_b = raddr
+ immb = True
+ elif raddr_b != raddr:
+ asm_error('can only encode one rotation/immediate')
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ if rmux == RMUX_AC:
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr
+ if rmux == RMUX_ANY:
+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr):
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ if (not immb) and (raddr_b == raddr):
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ if raddr_a is None:
+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))
+ raddr_a = raddr
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ if raddr_b is None:
+ assert not immb
+ raddr_b = raddr
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ asm_error('no free read slots')
+ if rmux == RMUX_A:
+ if (not mulw_rotate) and (raddr_a is not None) and (
+ ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))):
+ asm_error('conflicting rotations from regfile a')
+ if raddr_a is None:
+ raddr_a = raddr[0]
+ elif raddr_a != raddr[0]:
+ asm_error('can only read from one location in each regfile')
+ arot_r5 = raddr[2]
+ if raddr[1] == 0:
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ raddr = 48 + raddr[1]
+ if not immb:
+ if raddr_b is not None:
+ asm_error('regfile b and rotation don\'t mix')
+ raddr_b = raddr
+ immb = True
+ elif raddr_b != raddr:
+ asm_error('can only encode one rotation/immediate')
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ if rmux == RMUX_B:
+ if immb:
+ asm_error('regfile b and rotation/immediates don\'t mix')
+ if raddr_b is None:
+ raddr_b = raddr
+ elif raddr_b != raddr:
+ asm_error('can only read from one location in each regfile')
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ assert 0
+
+# ok if:
+# - accumulator (r0-r3)
+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy,
+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it
+# was written by r5quad. so, by default, r5 isn't considered uniform. todo:
+# what about vr_wait/vw_wait/mutex?
+def read_rot_ok(rmux, raddr_a, raddr_b):
+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or
+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy
+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy
+
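+# Queued .data bytes are padded to a multiple of 8 and packed little-endian
+# into (low word, high word) pairs so they occupy whole instruction slots.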
+def asm_flush_prog_data():
+ global prog_data
+
+ while len(prog_data) & 7:
+ prog_data.append(0)
+ for i in xrange(0, len(prog_data), 8):
+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0),
+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {}))
+ prog_data = []
+
+def asm_line(sets, location, line):
+ global current_location, construct, nwarn_level
+
+ prev_location = current_location
+ current_location = location
+
+ try:
+ if construct != None:
+ if re_macro.match(line):
+ construct_stack.append(CONSTRUCT_MACRO)
+ elif re_if.match(line):
+ construct_stack.append(CONSTRUCT_IF)
+ elif re_rep.match(line):
+ construct_stack.append(CONSTRUCT_REP)
+ else:
+ else_m = line == '.else'
+ elif_m = re_elif.match(line)
+ if elif_m:
+ end_construct = CONSTRUCT_IF
+ else:
+ end_construct = {
+ '.endm': CONSTRUCT_MACRO,
+ '.else': CONSTRUCT_IF,
+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE,
+ '.endr': CONSTRUCT_REP}.get(line)
+ if end_construct is not None:
+ end_construct &= construct_stack.pop()
+ if end_construct == 0:
+ if elif_m:
+ asm_error('unexpected .elif')
+ asm_error('unexpected %s' % line)
+ if len(construct_stack) == 0:
+ lines = construct
+ construct = None
+ if end_construct == CONSTRUCT_MACRO:
+ return
+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE):
+ condition_if, condition_else = lines[0]
+ lines = lines[1:]
+ if condition_if:
+ for location, line in lines:
+ asm_line(sets, location, line)
+ if else_m:
+ construct = [(condition_else, False)]
+ construct_stack.append(CONSTRUCT_ELSE)
+ elif elif_m:
+ if elif_m.group('set'):
+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets))
+ else:
+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets)
+ condition_else = condition_else and (not condition_if)
+ construct = [(condition_if, condition_else)]
+ construct_stack.append(CONSTRUCT_IF)
+ return
+ if end_construct == CONSTRUCT_REP:
+ name, count = lines[0]
+ lines = lines[1:]
+ for i in xrange(count):
+ sets[name] = i
+ for location, line in lines:
+ asm_line(sets, location, line)
+ return
+ assert 0
+ if else_m:
+ construct_stack.append(CONSTRUCT_ELSE)
+ elif elif_m:
+ construct_stack.append(CONSTRUCT_IF)
+ construct.append((current_location, line))
+ return
+
+ if line in ('.endm', '.else', '.endif', '.endr'):
+ asm_error('unexpected %s' % line)
+ if re_elif.match(line):
+ asm_error('unexpected .elif')
+
+ m = re_macro.match(line)
+ if m:
+ construct = []
+ construct_stack.append(CONSTRUCT_MACRO)
+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct)
+ return
+
+ m = re_if.match(line)
+ if m:
+ if m.group('set'):
+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets)
+ else:
+ # not not forces condition to a bool (this matters if condition is
+ # something mutable like a list)
+ condition = not not arg_eval(m.group('condition'), sets)
+ construct = [(condition, not condition)]
+ construct_stack.append(CONSTRUCT_IF)
+ return
+
+ m = re_rep.match(line)
+ if m:
+ count = arg_eval(m.group('count'), sets)
+ if not is_int(count):
+ asm_error('.rep count must be integer')
+ construct = [(m.group('name'), count)]
+ construct_stack.append(CONSTRUCT_REP)
+ return
+
+ m = re_include.match(line)
+ if m:
+ filename = arg_eval(m.group('filename'), sets)
+ if not isinstance(filename, str):
+ asm_error('expected string')
+ asm_file(sets, '%s: %s' % (current_location, filename), filename)
+ return
+
+ m = re_set.match(line)
+ if m:
+ sets[m.group('name')] = arg_eval(m.group('val'), sets)
+ return
+
+ m = re_unset.match(line)
+ if m:
+ name = m.group('name')
+ if name not in sets:
+ asm_error('%s not set' % name)
+ if name in arg_defs: # todo: see arg_eval
+ sets[name] = arg_defs[name]
+ else:
+ del sets[name]
+ return
+
+ m = re_eval.match(line)
+ if m:
+ arg_eval(m.group('expr'), sets)
+ return
+
+ m = re_print_info_warn_error.match(line)
+ if m:
+ def print_fn(message):
+ print message
+ def info_fn(message):
+ sys.stderr.write('%s\n' % message)
+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[
+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets))
+ return
+
+ m = re_assert.match(line)
+ if m:
+ if not arg_eval(m.group('condition'), sets):
+ asm_error('assertion failure: \'%s\'' % m.group('condition'))
+ return
+
+ m = re_data.match(line)
+ if m:
+ size = int(m.group('size'))
+ for datum in smart_split(m.group('data')):
+ datum = arg_eval(datum, sets)
+ if not is_int(datum):
+ asm_error('datum must be integer')
+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size))
+ return
+
+ m = re_macro_inst.match(line)
+ if m:
+ name = m.group('name')
+ if name in macros:
+ params, lines = macros[name]
+ args = smart_split(m.group('args'))
+ if len(args) > len(params):
+ asm_error('too many arguments to macro')
+ sets = sets.copy()
+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args)))
+ for param in params[len(args):]:
+ if param in sets:
+ if param in arg_defs: # todo: see arg_eval
+ sets[param] = arg_defs[param]
+ else:
+ del sets[param]
+ for location, line in lines:
+ asm_line(sets, '%s: %s' % (current_location, location), line)
+ return
+
+ if line == '.pushnwarn':
+ nwarn_level += 1
+ return
+ if line == '.popnwarn':
+ if nwarn_level == 0:
+ asm_error('.popnwarn without .pushnwarn')
+ nwarn_level -= 1
+ return
+
+ # everything below assumes prog is up to date
+ asm_flush_prog_data()
+
+ m = re_label.match(line)
+ if m:
+ name = m.group('name')
+ if name[0].isdigit():
+ labels.setdefault(name, []).append(len(prog))
+ else:
+ if name[0] == ':':
+ undecorated_name = name[1:]
+ else:
+ undecorated_name = name
+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels):
+ asm_error('named label defined twice')
+ labels[name] = len(prog)
+ return
+
+ annots = line.split('@')
+ ops = [op.strip() for op in annots[0].split(';')]
+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), [])
+ sig = get_sig(ops[-1])
+ if sig != SIG_NORMAL:
+ ops = ops[:-1]
+ if len(ops) > 2:
+ asm_error('too many ops')
+ elif (len(ops) == 1) and (ops[0] == ''):
+ ops = []
+ ops = (ops + ['nop', 'nop'])[:2]
+ m = re_op.match(ops[0])
+ if not m:
+ asm_error('invalid syntax')
+ aop, aargs_n = get_aop(m.group('op'))
+ if (aop == AOP_BRA) or (aop == AOP_BRR):
+ acond = get_bcond(m.group('cond'))
+ else:
+ acond = get_cond(m.group('cond'))
+ asf = get_setf(m.group('sf'))
+ aargs = smart_split(m.group('args'))
+ if len(aargs) != aargs_n:
+ asm_error('wrong operand count')
+ ard, ara, arb = (aargs + [None, None, None])[:3]
+ m = re_op.match(ops[1])
+ if not m:
+ asm_error('invalid syntax')
+ mop, margs_n = get_mop(m.group('op'))
+ mcond = get_cond(m.group('cond'))
+ msf = get_setf(m.group('sf'))
+ margs = smart_split(m.group('args'))
+ if len(margs) != margs_n:
+ asm_error('wrong operand count')
+ mrd, mra, mrb = (margs + [None, None, None])[:3]
+ # eval srcs first so allocator can retire and reuse registers for dst
+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets)
+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets)
+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets)
+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets)
+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets)
+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets)
+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or
+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))):
+ asm_error('cannot have 2 arguments with different rotations')
+ if aarmux is not None:
+ awrot = (awrot + aadrot) % 16
+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16
+ if (awrot != 0) or awrot_r5:
+ asm_error('rotate not allowed on add write')
+ if marmux is not None:
+ mwrot = (mwrot + madrot) % 16
+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16
+
+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI)
+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF)
+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes(
+ [aarpack, abrpack, marpack, mbrpack],
+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL],
+ aop == AOP_FTOI,
+ [awpack, mwpack],
+ [afloatw, mop == MOP_FMUL])
+ if forcebs[0]:
+ aarmux = RMUX_B
+ if forcebs[1]:
+ abrmux = RMUX_B
+ if forcebs[2]:
+ marmux = RMUX_B
+ if forcebs[3]:
+ mbrmux = RMUX_B
+
+ # extend nops to 3 operands
+ if aop == AOP_NOP:
+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+ if mop == MOP_NOP:
+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+
+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand)
+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ):
+ if forcerafloat:
+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand
+ # instead of duplicating the 2nd operand, take the ra operand from
+ # the mul op thus forcing the ra value to be considered a float for
+ # the purposes of unpacking
+ if marmux == RMUX_A:
+ abraddr, abrmux = maraddr, marmux
+ else:
+ assert mbrmux == RMUX_A
+ abraddr, abrmux = mbraddr, mbrmux
+ else:
+ abraddr, abrmux = aaraddr, aarmux
+ else:
+ assert not forcerafloat # can only forcerafloat if we have an unused operand
+
+ # handle write addrs
+ if (awmux == mwmux) and (awmux != WMUX_ANY):
+ asm_error('add/mul ops not allowed to write to same regfile')
+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A)
+
+ # handle branch
+ if (aop == AOP_BRA) or (aop == AOP_BRR):
+ # check setf
+ if asf:
+ asm_error('setf not allowed on bra/brr')
+
+ # check pack/unpack
+ if (pack != 0) or (unpack != 0):
+ asm_error('pack/unpack not allowed with bra/brr')
+
+ # handle read address
+ if aarmux == RMUX_LABEL:
+ if (aop == AOP_BRA) and aaraddr[1]:
+ asm_warning('bra with rel label')
+ if (aop == AOP_BRR) and (not aaraddr[1]):
+ asm_warning('brr with abs label')
+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+ if aarmux == RMUX_ANY:
+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A
+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A):
+ asm_error('branch destination must be either label, immediate, or from regfile a')
+ if aarmux == RMUX_IMM:
+ imm = aaraddr
+ raddr = 0 # can't use RADDR_NOP
+ elif aarmux == RMUX_A:
+ if (aaraddr[1] != 0) or (aaraddr[2] != 0):
+ asm_error('rotation of read from regfile a not allowed with branch')
+ if aop == AOP_BRR:
+ asm_warning('brr with ra')
+ imm = 0
+ raddr = aaraddr[0]
+ else:
+ assert 0
+
+ # check mul op is nop
+ if mop != MOP_NOP:
+ asm_error('mul op not allowed with branch')
+
+ # check sig
+ if sig != SIG_NORMAL:
+ asm_error('no signal allowed with branch')
+
+ if raddr >= 32:
+ asm_error('can only branch to register locations in physical regfile')
+ if raddr & 1:
+ asm_warning('branch instruction will destroy flags (see hw-2780)')
+
+ # construct branch instruction
+ prog.append((imm,
+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28),
+ line, annots))
+
+ return
+
+ # use COND_NEVER when possible (might save power / allow mul setf)
+ if not dict(annots).get('preserve_cond', 0):
+ if (awaddr == WADDR_NOP) and (not asf):
+ acond = COND_NEVER
+ if (mwaddr == WADDR_NOP) and (not msf):
+ mcond = COND_NEVER
+
+ # attempt to convert movs to ldi
+ if (# no mul setf
+ (not msf) and
+ # ops must either be nop or mov of sema/label/imm/immv
+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+ # but we don't want 2 nops
+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and
+ # if both ops are movs, srcs must be identical
+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and
+ # no signal
+ (sig == SIG_NORMAL)):
+ # make sure aarmux/aaraddr contains the value
+ if aop != AOP_MOV:
+ aarmux = marmux
+ aaraddr = maraddr
+
+ # convert immediate
+ if aarmux == RMUX_SEMA:
+ ldi_mode = LDI_SEMA
+ elif aarmux == RMUX_LABEL:
+ ldi_mode = LDI_32
+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+ elif aarmux == RMUX_IMMV:
+ signed, unsigned = True, True
+ imm = 0
+ for i, elem in enumerate(aaraddr):
+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1):
+ signed = False
+ if elem not in (0, 1, 2, 3):
+ unsigned = False
+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i))
+ if not (signed or unsigned):
+ asm_error('can\'t encode vector immediate')
+ if signed:
+ ldi_mode = LDI_EL_SIGNED
+ else:
+ ldi_mode = LDI_EL_UNSIGNED
+ aaraddr, aarmux = imm, RMUX_IMM
+ elif aarmux == RMUX_IMM:
+ ldi_mode = LDI_32
+ else:
+ assert 0
+
+ # construct ldi instruction
+ prog.append((aaraddr,
+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28),
+ line, annots))
+
+ return
+
+ # convert movs to alu ops
+ if aop == AOP_MOV:
+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0):
+ aop = AOP_XOR
+ aaraddr, aarmux = 0, RMUX_AC
+ abraddr, abrmux = 0, RMUX_AC
+ else:
+ aop = AOP_OR
+ abraddr, abrmux = aaraddr, aarmux
+ if mop == MOP_MOV:
+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0):
+ mop = MOP_V8SUBS
+ maraddr, marmux = 0, RMUX_AC
+ mbraddr, mbrmux = 0, RMUX_AC
+ else:
+ mop = MOP_V8MIN
+ mbraddr, mbrmux = maraddr, marmux
+
+ # normal alu instruction...
+
+ # handle setf
+ if asf and (aop == AOP_NOP):
+ asm_error('nop.setf is not allowed in add pipe')
+ if msf and (mop == MOP_NOP):
+ asm_warning('nop.setf, really?')
+ if (aop == AOP_NOP) or (acond == COND_NEVER):
+ sf = msf
+ else:
+ if msf:
+ asm_error('setf only allowed on mul op if add op is nop or add condition is never')
+ sf = asf
+
+ # handle read addrs
+ raddr_a = None
+ raddr_b = None
+ immb = False
+ arot_r5 = False
+ muxes = [0, 0, 0, 0]
+ if mwrot != 0:
+ raddr_b = 48 + mwrot
+ immb = True
+ if mwrot_r5 and have_am:
+ raddr_b = 48
+ immb = True
+ for f in (lambda rmux: rmux != RMUX_ANY), (lambda rmux: rmux == RMUX_ANY): # do RMUX_ANY last
+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux):
+ if f(rmux):
+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux)
+ add_a, add_b, mul_a, mul_b = muxes
+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)):
+ # some output elements might not be as expected
+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)):
+ bad_elems = 0xffff
+ else:
+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111
+ if mwrot > 12:
+ bad_elems ^= 0xffff
+ bad_elems &= dict(annots).get('mul_used', 0xffff)
+ if not msf:
+ if mwaddr == WADDR_NOP:
+ # not writing anywhere and not setting flags. no elements used
+ bad_elems = 0
+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or
+ ((not ws) and (mwaddr == 37))):
+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/
+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags.
+ # only use element 0
+ bad_elems &= 0x0001
+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or
+ ((not ws) and (mwaddr == 42))):
+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting
+ # flags. only use elements 0, 4, 8, and 12
+ bad_elems &= 0x1111
+ if bad_elems:
+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected')
+ if raddr_a is None:
+ raddr_a = RADDR_NOP
+ if raddr_b is None:
+ raddr_b = RADDR_NOP
+ if immb:
+ if sig != SIG_NORMAL:
+ asm_error('rotation/immediates and signal don\'t mix')
+ sig = SIG_SMALLIMMED
+ if arot_r5 or (mwrot_r5 and (not have_am)):
+ if sig != SIG_NORMAL:
+ asm_error('rotation/immediates/signal don\'t mix')
+ sig = SIG_ROTATE
+
+ # construct instruction
+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29),
+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28),
+ line, annots))
+ finally:
+ current_location = prev_location
+
+def preprocess_passthrough(file):
+ line_number = 0
+ for line in file:
+ line_number += 1
+ yield line_number, line
+
+def asm_file(sets, location, filename, preprocess = None):
+ global current_dir, current_location
+
+ if filename is None:
+ location = '<stdin>'
+ file = sys.stdin
+
+ prev_dir = current_dir
+ else:
+ filename = os.path.normpath(os.path.join(current_dir, filename))
+
+ try:
+ file = open(filename)
+ except Exception, e:
+ asm_error(e)
+ except:
+ asm_error('unknown error while opening file %s' % filename)
+
+ prev_dir = current_dir
+ current_dir = os.path.dirname(filename)
+
+ prev_location = current_location
+ current_location = location
+
+ if preprocess is None:
+ preprocess = preprocess_passthrough
+
+ try:
+ for line_number, line in preprocess(file):
+ # strip off comments and whitespace
+ line = line.split('#')[0].strip()
+ if line == '':
+ continue
+
+ asm_line(sets, '%s: %d' % (current_location, line_number), line)
+ finally:
+ current_dir = prev_dir
+ current_location = prev_location
+
+def asm_end_prog():
+ # check we aren't in a multi-line construct (eg .macro or .rep)
+ if construct != None:
+ asm_error({
+ CONSTRUCT_MACRO: '.macro without .endm',
+ CONSTRUCT_IF: '.if/.elif without .endif',
+ CONSTRUCT_ELSE: '.else without .endif',
+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]])
+
+ # check no warnings level back to 0
+ if nwarn_level != 0:
+ asm_error('.pushnwarn without .popnwarn')
+
+ # flush queued up data
+ asm_flush_prog_data()
+
+ # fixup all the label references we can
+ for pc in xrange(len(prog)):
+ if isinstance(prog[pc][0], tuple):
+ location, label, rel, offset = prog[pc][0]
+ if label[0].isdigit():
+ label_pcs = labels.get(label[:-1], [])
+ if label[-1] == 'b':
+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:]
+ else:
+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1]
+ if label_pcs == []:
+ asm_error('search for label reached begin/end of file', location = location)
+ imm = label_pcs[0]
+ elif label in labels:
+ imm = labels[label]
+ elif (':' + label) in labels:
+ imm = labels[':' + label]
+ elif external_link:
+ continue # let the external linker deal with it
+ else:
+ asm_error('undefined label', location = location)
+ imm = (imm * 8) + offset
+ if rel:
+ imm -= (pc + 4) * 8 # relative to instruction after delay slots
+ imm &= (1 << 32) - 1
+ else:
+ if not external_link:
+ asm_error('can\'t get absolute address without using an external linker. this mode doesn\'t have an external linker', location = location)
+ imm = (location, label, rel, offset, imm)
+ prog[pc] = (imm,) + prog[pc][1:]
+
+def asm_init():
+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level
+
+ current_dir = os.getcwd()
+ current_location = ''
+ prog = []
+ prog_data = []
+ macros = {
+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]),
+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])}
+ labels = {}
+ construct = None
+ construct_stack = []
+ nwarn_level = 0
+
+def asm_reset_prog():
+ global prog, labels
+
+ prog = []
+ labels = {}
+
+###############################################################################
+# dumping
+###############################################################################
+
+def print_lines(lines):
+ for line in lines:
+ print line
+
+class dumper_t:
+ def external_link(self): return False
+ def begin(self): pass
+ def label(self, pc, name): pass
+ def line(self, pc, ls, ms, line, annots, first): pass
+ def end(self): pass
+ def sets(self, sets): pass
+ def direct(self, line): pass
+
+class clif_dumper_t(dumper_t):
+ def __init__(self):
+ self.annot_mode = 0
+
+ def external_link(self):
+ return True
+
+ def parse_annot_mode(self, line):
+ l = line.split(',')
+ self.annot_mode = int(l[0])
+ if self.annot_mode not in (0, 1, 2):
+ asm_error('bad annot mode')
+ if self.annot_mode == 2:
+ if len(l) != 2:
+ asm_error('expected buffer name')
+ self.annot_name = l[1].strip()
+ self.annot_offset = 0
+ elif len(l) != 1:
+ asm_error('unexpected comma')
+
+ def label(self, pc, name):
+ if (self.annot_mode != 1) and (name[0] == ':'):
+ if self.annot_mode == 2:
+ name = name + '_annotations'
+ print '@label %s' % name[1:]
+ else:
+ print '// :%s' % name
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if self.annot_mode == 0:
+ if isinstance(ls, tuple):
+ if len(ls) == 5:
+ location, label, rel, offset, offset_from_prog = ls
+ assert not rel
+ ls = '[. - %d + %d]' % (pc * 8, offset_from_prog)
+ else:
+ location, label, rel, offset = ls
+ if rel:
+ asm_error('relative external label references not allowed in this mode', location = location)
+ ls = '[%s + %d]' % (label, offset)
+ else:
+ ls = '0x%08x' % ls
+ print '%s 0x%08x // %s' % (ls, ms, line)
+ elif self.annot_mode == 1:
+ print '// %s' % line
+ for annot in annots:
+ print '0x%08x 0x%08x // %s' % ({
+ # todo: would rather not have these hard coded
+ 'mul_used': 1,
+ 'preserve_cond': 2,
+ 'geomd_open': 3,
+ 'geomd_i': 4,
+ 'geomd_tris_clear': 5,
+ 'geomd_verts': 6,
+ 'geomd_tris_add': 7,
+ 'geomd_tris_set_center': 8,
+ 'geomd_region_clear': 9,
+ 'geomd_region_set': 10,
+ 'geomd_images_clear': 11,
+ 'geomd_images_l': 12,
+ 'geomd_images_b': 13,
+ 'geomd_images_r': 14,
+ 'geomd_images_t': 15,
+ 'geomd_images_add_vpm': 16,
+ 'trace_4c': 17,
+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0])
+ if len(annots) != 0:
+ print '0x00000000 // end'
+ else:
+ assert self.annot_mode == 2
+ if len(annots) == 0:
+ print '0x00000000 // %s' % line
+ else:
+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
+ self.annot_offset += (len(annots) * 8) + 4
+
+ def direct(self, line):
+ print line
+
+class plain_dumper_t(dumper_t):
+ def line(self, pc, ls, ms, line, annots, first):
+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
+
+class c_c_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, array_name):
+ self.header_name = header_name
+ self.array_name = array_name
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ self.external_labels = set()
+ self.lines = []
+
+ print '#include "%s.h"' % self.header_name
+ print ''
+ print '#ifdef _MSC_VER'
+ print ' #include <stdint.h>'
+ print ' /* cast through uintptr_t to avoid warnings */'
+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
+ print '#else'
+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))'
+ print '#endif'
+ print ''
+ print '#ifdef __cplusplus'
+ print 'extern "C" { /* the types are probably wrong... */'
+ print '#endif'
+
+ def label(self, pc, name):
+ self.lines.append('// :%s' % name)
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if isinstance(ls, tuple):
+ if len(ls) == 5:
+ location, label, rel, offset, offset_from_prog = ls
+ assert not rel
+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
+ else:
+ location, label, rel, offset = ls
+ if rel:
+ asm_error('relative external label references not allowed in this mode', location = location)
+ if label not in self.external_labels:
+ self.external_labels.add(label)
+ print 'extern uint8_t %s[];' % label
+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
+ else:
+ ls = '0x%08x' % ls
+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+
+ def end(self):
+ print '#ifdef __cplusplus'
+ print '}'
+ print '#endif'
+ print ''
+ print '#ifdef _MSC_VER'
+ print '__declspec(align(8))'
+ print '#elif defined(__GNUC__)'
+ print '__attribute__((aligned(8)))'
+ print '#endif'
+ print 'unsigned int %s[] = {' % self.array_name
+ print_lines(self.lines)
+ print '};'
+ print '#ifdef __HIGHC__'
+ print '#pragma Align_to(8, %s)' % self.array_name
+ print '#endif'
+
+class c_h_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, array_name):
+ self.full_header_name = full_header_name
+ self.array_name = array_name
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ print '#ifndef %s_H' % self.full_header_name
+ print '#define %s_H' % self.full_header_name
+ print ''
+ print 'extern unsigned int %s[];' % self.array_name
+ print ''
+
+ def label(self, pc, name):
+ if name[0] == ':':
+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
+
+ def end(self):
+ print ''
+ print '#endif'
+
+class ml_c_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, name, annots):
+ self.header_name = header_name
+ self.name = name
+ self.annots = annots
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ if self.annots:
+ self.annot_lines = []
+ self.lines = []
+ self.external_labels = set()
+ self.link_lines = []
+
+ print '#include "%s.h"' % self.header_name
+ print '#include <assert.h>'
+ if self.annots:
+ print '#ifdef SIMPENROSE'
+ print '#include <stddef.h>'
+ print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
+ print ''
+
+ def label(self, pc, name):
+ self.lines.append('// :%s' % name)
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if self.annots:
+ if len(annots) == 0:
+ self.annot_lines.append('NULL,')
+ else:
+ print 'static unsigned int const annotations_%d[] = {' % pc
+ for annot in annots:
+ print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
+ print ' SIMPENROSE_SHADER_ANNOTATION_END};'
+ print ''
+ self.annot_lines.append('annotations_%d,' % pc)
+ if isinstance(ls, tuple):
+ self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2))
+ if len(ls) == 5:
+ location, label, rel, offset, offset_from_prog = ls
+ assert not rel
+ self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog))
+ else:
+ location, label, rel, offset = ls
+ self.external_labels.add(label)
+ if rel:
+ self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
+ else:
+ self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset))
+ ls = '0xdeadbeef'
+ else:
+ ls = '0x%08x' % ls
+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+
+ def end(self):
+ if self.annots:
+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name
+ print_lines(self.annot_lines)
+ print '};'
+ print '#endif'
+ print ''
+ print 'static unsigned int const array[] = {'
+ print_lines(self.lines)
+ print '};'
+ print ''
+ print 'void %s_link(void *p_in, unsigned int base' % self.name
+ for label in sorted(self.external_labels):
+ print ' , unsigned int %s' % label
+ print ' )'
+ print '{'
+ print ' unsigned int *p = (unsigned int *)p_in;'
+ print ' unsigned int i;'
+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper()
+ print ' p[i] = array[i];'
+ print ' }'
+ print_lines(self.link_lines)
+ print '}'
+
+class ml_h_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, name, annots):
+ self.full_header_name = full_header_name
+ self.name = name
+ self.annots = annots
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ self.external_labels = set()
+ self.lines_n = 0
+
+ print '#ifndef %s_H' % self.full_header_name
+ print '#define %s_H' % self.full_header_name
+ print ''
+ if self.annots:
+ print '#ifdef SIMPENROSE'
+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name
+ print '#endif'
+ print ''
+
+ def label(self, pc, name):
+ if name[0] == ':':
+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8)
+ if self.annots:
+ print '#ifdef SIMPENROSE'
+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc)
+ print '#endif'
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if isinstance(ls, tuple) and (len(ls) != 5):
+ self.external_labels.add(ls[1])
+ self.lines_n += 1
+
+ def end(self):
+ print ''
+ print 'extern void %s_link(void *p, unsigned int base' % self.name
+ for label in sorted(self.external_labels):
+ print ' , unsigned int %s' % label
+ print ' );'
+ print ''
+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8))
+ print ''
+ print '#endif'
+
+def print_lines_lc(lines):
+ for line in lines:
+ print '%s \\' % line
+
+def print_groups_lc(groups):
+ first = True
+ for group in groups:
+ if first:
+ print '{ \\'
+ else:
+ print ', { \\'
+ print_lines_lc(group)
+ print '} \\'
+ first = False
+
+class inline_c_dumper_t(dumper_t):
+ def __init__(self, annots):
+ self.annots = annots
+ self.iteration = False
+
+ def begin_iteration(self):
+ assert not self.iteration
+ self.iteration = True
+ self.iteration_lines = []
+ if self.annots:
+ self.iteration_annot_lines = []
+ self.annot_arrs = []
+
+ def end_iteration(self):
+ assert self.iteration
+ self.iteration = False
+ print '%d, \\' % self.iteration_n
+ if self.annots:
+ print '( \\'
+ print_groups_lc(self.iteration_lines)
+ if self.annots:
+ print '), ( \\'
+ print_groups_lc(self.iteration_annot_lines)
+ print '), ( \\'
+ for annot_arr in self.annot_arrs:
+ print_lines_lc(annot_arr)
+ print ') \\'
+
+ def begin(self):
+ self.n = 0
+ self.lines = []
+ if self.annots:
+ self.annot_lines = []
+ if not self.iteration:
+ self.annot_arrs = []
+
+ def label(self, pc, name):
+ self.lines.append('/* :%s */' % name)
+ if self.annots:
+ self.annot_lines.append('/* :%s */' % name)
+
+ def line(self, pc, ls, ms, line, annots, first):
+ self.n += 1
+ if first:
+ prefix = ''
+ else:
+ prefix = ', '
+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line))
+ if self.annots:
+ if len(annots) == 0:
+ a = 'NULL'
+ else:
+ a = 'annotations_%d' % len(self.annot_arrs)
+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)]
+ for annot in annots:
+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]))
+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};')
+ self.annot_arrs.append(annot_arr)
+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line))
+
+ def end(self):
+ if self.iteration:
+ if len(self.iteration_lines) == 0:
+ self.iteration_n = self.n
+ elif self.iteration_n != self.n:
+ asm_error('number of instructions differs between iterations')
+ self.iteration_lines.append(self.lines)
+ if self.annots:
+ self.iteration_annot_lines.append(self.annot_lines)
+ else:
+ if self.annots:
+ print '( \\'
+ print_lines_lc(self.lines)
+ if self.annots:
+ print '), ( \\'
+ print_lines_lc(self.annot_lines)
+ print '), ( \\'
+ for annot_arr in self.annot_arrs:
+ print_lines_lc(annot_arr)
+ print ') \\'
+
+ def direct(self, line):
+ print line
+
+class asvc_dumper_t(dumper_t):
+ def external_link(self):
+ return True
+
+ def begin(self):
+ print '.align 8'
+
+ def label(self, pc, name):
+ if name[0] == ':':
+ print '%s::' % name[1:]
+ else:
+ print '%s:' % name
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if isinstance(ls, tuple):
+ location, label, rel, offset = ls[:4]
+ if rel:
+ ls = '%s + %d - (. + 32)' % (label, offset)
+ else:
+ ls = '%s + %d' % (label, offset)
+ else:
+ ls = '0x%08x' % ls
+ print '.word %s, 0x%08x ; %s' % (ls, ms, line)
+
+def is_ra_or_rb(val):
+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B))
+
+class aliases_dumper_t(dumper_t):
+ def external_link(self):
+ return True
+
+ def begin(self):
+ print '#ifndef JUST_DQASM_ARGS'
+
+ def label(self, pc, name):
+ if not name[0].isdigit():
+ if name[0] == ':':
+ name = name[1:]
+ print '"bs%s", "bs%x",' % (name, pc * 8)
+ print '"bu%s", "bu%x",' % (name, pc * 8)
+
+ def end(self):
+ print '#endif'
+
+ # todo: handle things other than ra and rb? dqasm only allows ra and rb atm
+ def sets(self, sets):
+ dqasm_args = []
+ print '#ifndef JUST_DQASM_ARGS'
+ for name in sets:
+ if is_ra_or_rb(sets[name]):
+ dqasm_args.append('-r%s=%s' % (sets[name], name))
+ print '"%s", "%s",' % (name, sets[name])
+ elif isinstance(sets[name], list):
+ for i, val in enumerate(sets[name]):
+ if is_ra_or_rb(val):
+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i))
+ print '"%s[%d]", "%s",' % (name, i, val)
+ print '#endif'
+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args)
+
+def dump(dumper):
+ if (len(prog) != 0) or (len(labels) != 0):
+ dumper.begin()
+
+ sorted_labels = []
+ for name in labels:
+ if name[0].isdigit():
+ for pc in labels[name]:
+ sorted_labels.append((pc, name))
+ else:
+ sorted_labels.append((labels[name], name))
+ sorted_labels.sort(reverse = True)
+
+ first = True
+ for pc in xrange(len(prog)):
+ ls, ms, line, annots = prog[pc]
+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc):
+ dumper.label(*sorted_labels.pop())
+ dumper.line(pc, ls, ms, line, annots, first)
+ first = False
+ for sorted_label in sorted_labels:
+ assert sorted_label[0] == len(prog)
+ dumper.label(*sorted_label)
+
+ dumper.end()
+
+###############################################################################
+# preprocessing
+###############################################################################
+
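+# In inline_c mode, assembly is embedded in the C source between '%[' and '%]'.
+# An optional '%|' separates the body from an iteration spec: iterations are
+# separated by '%,', parameters within an iteration by '%/', and '%0', '%1', ...
+# in the body are replaced by those parameters. For example (hypothetical):
+#   %[ mov %0, %1 %| ra0 %/ unif %, ra1 %/ vary %]
+# assembles the block twice, once per parameter set.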
+def preprocess_inline_c(dumper):
+ def preprocess(file):
+ ls = None
+ line_number = 0
+ for line in file:
+ line_number += 1
+ while True:
+ if ls is None:
+ l = line.split('%[', 1)
+ if len(l) == 1:
+ dumper.direct(l[0].rstrip())
+ break
+ dumper.direct('%s \\' % l[0].rstrip())
+ line = l[1]
+ ls = []
+ else:
+ l = line.split('%]', 1)
+ ls.append((line_number, l[0]))
+ if len(l) == 1:
+ break
+ line = l[1]
+ l = ls[-1][1].split('%|', 1)
+ if len(l) == 1:
+ for l_number, l in ls:
+ yield l_number, l
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ else:
+ ls[-1] = (ls[-1][0], l[0])
+ if hasattr(dumper, 'begin_iteration'):
+ dumper.begin_iteration()
+ for repls in l[1].split('%,'):
+ repls = [repl.strip() for repl in repls.split('%/')]
+ for l_number, l in ls:
+ for i, repl in enumerate(repls):
+ l = l.replace('%' + str(i), repl)
+ yield l_number, l
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ if hasattr(dumper, 'end_iteration'):
+ dumper.end_iteration()
+ ls = None
+ return preprocess
+
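+# In clif mode, lines between a lone '%[' and '%]' are assembled, as are other
+# lines beginning with '%' (with the '%' stripped); '%@' selects the annotation
+# mode and everything else is passed straight through to the output.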
+def preprocess_clif(dumper):
+ def preprocess(file):
+ in_asm = False
+ line_number = 0
+ for line in file:
+ line_number += 1
+ if in_asm:
+ if line.strip() == '%]':
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ in_asm = False
+ else:
+ yield line_number, line
+ else:
+ if line.strip() == '%[':
+ in_asm = True
+ elif (line[:1] == '%') and (line[:2] != '%@'):
+ yield line_number, line[1:]
+ else:
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ if line[:2] == '%@':
+ if hasattr(dumper, 'parse_annot_mode'):
+ dumper.parse_annot_mode(line[2:])
+ else:
+ dumper.direct(line.rstrip())
+ return preprocess
+
+###############################################################################
+# main
+###############################################################################
+
+def main():
+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
+
+ asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
+
+ # parse command line
+ parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
+ parser.add_option('-m', '--mode', dest = 'mode',
+ help = '<mode> should be clif, plain, ' +
+ 'c_c:<header_name>,<full_header_name>,<array_name>, ' +
+ 'c_h:<header_name>,<full_header_name>,<array_name>, ' +
+ 'ml_c:<header_name>,<full_header_name>,<name>[,annots], ' +
+ 'ml_h:<header_name>,<full_header_name>,<name>[,annots], ' +
+ 'inline_c[:annots], asvc, or aliases[:<preprocess_mode>]', metavar = '<mode>')
+ parser.add_option('-t', '--target', dest = 'target',
+ help = '<target> should be a0, b0, or hera', metavar = '<target>')
+ parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
+ parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
+ parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
+ parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
+ parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
+ options, args = parser.parse_args()
+ if len(args) == 0:
+ filename = None
+ elif len(args) == 1:
+ filename = args[0]
+ else:
+ parser.print_help()
+ sys.exit(-1)
+
+ # handle mode
+ mode = options.mode or 'clif' # assume clif if no mode specified
+ if mode == 'clif':
+ dumper = clif_dumper_t()
+ preprocess = preprocess_clif(dumper)
+ elif mode == 'plain':
+ dumper = plain_dumper_t()
+ preprocess = None
+ elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
+ mode_options = mode[4:].split(',')
+ if len(mode_options) != 3:
+ asm_error('badly formatted mode on command line')
+ dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
+ preprocess = None
+ elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
+ mode_options = mode[5:].split(',')
+ if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
+ asm_error('badly formatted mode on command line')
+ dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
+ }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
+ preprocess = None
+ elif mode == 'inline_c':
+ dumper = inline_c_dumper_t(False)
+ preprocess = preprocess_inline_c(dumper)
+ elif mode == 'inline_c:annots':
+ dumper = inline_c_dumper_t(True)
+ preprocess = preprocess_inline_c(dumper)
+ elif mode == 'asvc':
+ dumper = asvc_dumper_t()
+ preprocess = None
+ elif mode == 'aliases':
+ dumper = aliases_dumper_t()
+ preprocess = None
+ elif mode == 'aliases:inline_c':
+ dumper = aliases_dumper_t()
+ preprocess = preprocess_inline_c(dumper)
+ else:
+ asm_error('invalid mode')
+ external_link = dumper.external_link()
+
+ # handle target
+ target = options.target or 'b0' # assume b0 if no target specified
+ if target == 'a0':
+ have_sema = False
+ have_am = False
+ mulw_rotate = False
+ have_lthrsw = False
+ elif target == 'b0':
+ have_sema = True
+ have_am = True
+ mulw_rotate = True
+ have_lthrsw = True
+ elif target == 'hera':
+ have_sema = True
+ have_am = False
+ mulw_rotate = True
+ have_lthrsw = True
+ else:
+ asm_error('invalid target')
+ if have_am:
+ sigs['loadam'] = SIG_LOADAM
+ arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
+ if have_lthrsw:
+ sigs['lthrsw'] = SIG_LTHRSW
+ del sigs['int']
+ arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
+
+ # handle misc options
+ allow_xor_0 = options.allow_xor_0
+ dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
+ warnings_are_errors = options.warnings_are_errors
+ disable_warnings = options.disable_warnings
+
+ # make options visible to asm
+ arg_defs['mode'] = mode
+ arg_defs['target'] = target
+
+ # arg_defs all setup at this point
+ sets = arg_defs.copy() # todo: see arg_eval
+
+ # handle command line sets
+ re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
+ for options_set in options.sets:
+ m = re_options_set.match(options_set)
+ if not m:
+ asm_error('badly formatted set on command line')
+ sets[m.group('name')] = arg_eval(m.group('val'), sets)
+
+ # assemble input file and dump
+ asm_file(sets, filename, filename, preprocess)
+ asm_end_prog()
+ dump(dumper)
+ for name in arg_defs: # todo: see arg_eval
+ del sets[name]
+ dumper.sets(sets)
+
+if __name__ == '__main__':
+ main()
diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
new file mode 100755
index 0000000..6a9a33f
--- /dev/null
+++ b/pi-util/rebase_liblinks.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+import os, sys
+from stat import *
+
+def walktree(top, callback, n, prefix):
+ '''recursively descend the directory tree rooted at top,
+ calling the callback function for each symbolic link found'''
+
+ for f in os.listdir(top):
+ pathname = os.path.join(top, f)
+ mode = os.lstat(pathname).st_mode
+ if S_ISDIR(mode):
+ # It's a directory, recurse into it
+ walktree(pathname, callback, n+1, prefix)
+ elif S_ISLNK(mode):
+ # It's a symlink, call the callback function
+ callback(pathname, os.readlink(pathname), n, prefix)
+
+def visitfile(file, linkname, n, prefix):
+ if (linkname.startswith(prefix + 'lib/')):
+ newlink = "../" * n + linkname[len(prefix):]
+ print 'relinking', file, "->", newlink
+ os.remove(file)
+ os.symlink(newlink, file)
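+# e.g. with the default prefix "/", a link two levels down such as
+# usr/lib/libfoo.so -> /lib/arm-linux-gnueabihf/libfoo.so.1 is rewritten to
+# the sysroot-relative ../../lib/arm-linux-gnueabihf/libfoo.so.1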
+
+if __name__ == '__main__':
+ argc = len(sys.argv)
+ if argc == 2:
+ walktree(sys.argv[1], visitfile, 0, "/")
+ elif argc == 3:
+ walktree(sys.argv[1], visitfile, 0, sys.argv[2])
+ else:
+ print "rebase_liblinks.py <local root> [<old sysroot>]"
+
+
+
diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
new file mode 100755
index 0000000..d8bdd91
--- /dev/null
+++ b/pi-util/syncroot.sh
@@ -0,0 +1,43 @@
+set -e
+
+if [ "$1" == "" ]; then
+ echo Usage: $0 \<src_dir\> [\<rootname\>]
+ echo src_dir is a source dir for rsync so may contain a machine name.
+ echo rootname will be set to \"raspian_jessie_pi1\" if missing
+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1
+ exit 1
+fi
+
+SYSROOT_NAME=$2
+if [ "$SYSROOT_NAME" == "" ]; then
+ SYSROOT_NAME=raspian_jessie_pi1
+fi
+
+DST_ROOT=`pwd`
+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot
+SRC=$1
+
+echo Sync src: $SRC
+echo Sync dest: $DST
+
+mkdir -p $DST/lib
+mkdir -p $DST/opt/vc/include
+mkdir -p $DST/usr/lib/pkgconfig
+mkdir -p $DST/usr/bin
+mkdir -p $DST/usr/share
+
+#### MUST NOT include /opt/vc/include/*GL*
+# Creates conflicts with GL includes inside Chrome
+
+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib
+rsync -rl $SRC/opt/vc/lib $DST/opt/vc
+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include
+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include
+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include
+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib
+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib
+rsync -rl $SRC/usr/include $DST/usr
+
+pi-util/rebase_liblinks.py $DST
+
+