LibreELEC.tv/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch (18556 lines, 584 KiB)

diff --git a/.gitignore b/.gitignore
index 524fb73..305632b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,6 +23,7 @@
.\#*
/.config
/.version
+/build/
/ffmpeg
/ffplay
/ffprobe
diff --git a/ffmpeg.c b/ffmpeg.c
index 9ffd833..7a86d7e 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -23,6 +23,11 @@
* multimedia converter based on the FFmpeg libraries
*/
+#ifdef RPI
+#define RPI_DISPLAY
+#define RPI_ZERO_COPY
+#endif
+
#include "config.h"
#include <ctype.h>
#include <string.h>
@@ -66,6 +71,25 @@
# include "libavfilter/buffersrc.h"
# include "libavfilter/buffersink.h"
+#ifdef RPI_DISPLAY
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/mmal/mmal.h>
+#include <interface/mmal/mmal_parameters_camera.h>
+#include <interface/mmal/mmal_buffer.h>
+#include <interface/mmal/util/mmal_util.h>
+#include <interface/mmal/util/mmal_default_components.h>
+#include <interface/mmal/util/mmal_connection.h>
+#include <interface/mmal/util/mmal_util_params.h>
+#pragma GCC diagnostic pop
+#ifdef RPI_ZERO_COPY
+#include "libavcodec/rpi_qpu.h"
+#endif
+#include "libavcodec/rpi_zc.h"
+#endif
+
#if HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/types.h>
@@ -158,6 +182,169 @@ static int restore_tty;
static void free_input_threads(void);
#endif
+#ifdef RPI_DISPLAY
+
+#define NUM_BUFFERS 4
+
+static MMAL_COMPONENT_T* rpi_display = NULL;
+static MMAL_POOL_T *rpi_pool = NULL;
+static volatile int rpi_display_count = 0;
+
+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+{
+ MMAL_POOL_T* pool;
+ size_t i;
+ size_t size = (w*h*3)/2;
+#ifdef RPI_ZERO_COPY
+ mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
+ assert(pool);
+#else
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size);
+
+ for (i = 0; i < NUM_BUFFERS; ++i)
+ {
+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+ char * bufPtr = buffer->data;
+ memset(bufPtr, i*30, w*h);
+ memset(bufPtr+w*h, 128, (w*h)/2);
+ }
+#endif
+
+ return pool;
+}
+
+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+#ifdef RPI_ZERO_COPY
+ av_rpi_zc_unref(buffer->user_data);
+ --rpi_display_count;
+#endif
+ mmal_buffer_header_release(buffer);
+}
+
+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ mmal_buffer_header_release(buffer);
+}
+
+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+{
+ MMAL_COMPONENT_T* display;
+ MMAL_DISPLAYREGION_T region =
+ {
+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+ .layer = 2,
+ .fullscreen = 0,
+ .dest_rect = {x, y, w, h}
+ };
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+
+ bcm_host_init(); // TODO is this needed?
+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+ assert(display);
+
+ mmal_port_parameter_set(display->input[0], &region.hdr);
+
+ {
+ MMAL_ES_FORMAT_T* format = display->input[0]->format;
+ format->encoding = MMAL_ENCODING_I420;
+ format->es->video.width = geo.stride_y;
+ format->es->video.height = geo.height_y;
+ format->es->video.crop.x = 0;
+ format->es->video.crop.y = 0;
+ format->es->video.crop.width = w;
+ format->es->video.crop.height = h;
+ mmal_port_format_commit(display->input[0]);
+ }
+
+ mmal_component_enable(display);
+
+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+
+ mmal_port_enable(display->input[0],display_cb_input);
+ mmal_port_enable(display->control,display_cb_control);
+
+ printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+
+ return display;
+}
+
+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+{
+ MMAL_BUFFER_HEADER_T* buf;
+
+ if (!display || !rpi_pool)
+ return;
+
+ if (rpi_display_count >= 3) {
+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+ return;
+ }
+
+ buf = mmal_queue_get(rpi_pool->queue);
+ if (!buf) {
+ // Running too fast so drop the frame
+ printf("Q alloc failure\n");
+ return;
+ }
+ assert(buf);
+ buf->cmd = 0;
+ buf->offset = 0; // Offset to valid data
+ buf->flags = 0;
+#ifdef RPI_ZERO_COPY
+{
+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+
+ buf->user_data = fr_buf;
+ buf->data = av_rpi_zc_vc_handle(fr_buf);
+ buf->alloc_size =
+ buf->length = av_rpi_zc_numbytes(fr_buf);
+
+ ++rpi_display_count;
+}
+#else
+{
+#error YYY
+ int w = fr->width;
+ int h = fr->height;
+ int w2 = (w+31)&~31;
+ int h2 = (h+15)&~15;
+
+ buf->length = (w2 * h2 * 3)/2;
+ buf->user_data = NULL;
+
+ //mmal_buffer_header_mem_lock(buf);
+ memcpy(buf->data, fr->data[0], w2 * h);
+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+ //mmal_buffer_header_mem_unlock(buf);
+}
+#endif
+
+ while (rpi_display_count >= 3) {
+ usleep(5000);
+ }
+
+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+ {
+ printf("** send failed: depth=%d\n", rpi_display_count);
+ display_cb_input(NULL, buf);
+ }
+}
+
+static void display_exit(MMAL_COMPONENT_T* display)
+{
+ if (display) {
+ if (rpi_pool) {
+ mmal_port_pool_destroy(display->input[0], rpi_pool);
+ }
+ mmal_component_destroy(display);
+ }
+}
+
+#endif
+
+
/* sub2video hack:
Convert subtitles to video with alpha to insert them in filter graphs.
This is a temporary solution until libavfilter gets real subtitles support.
@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
avformat_close_input(&input_files[i]->ctx);
av_freep(&input_files[i]);
}
+
+#ifdef RPI_DISPLAY
+ display_exit(rpi_display);
+#endif
+
for (i = 0; i < nb_input_streams; i++) {
InputStream *ist = input_streams[i];
@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
av_freep(&ist->filters);
av_freep(&ist->hwaccel_device);
+#ifdef RPI_ZERO_COPY
+ av_rpi_zc_uninit(ist->dec_ctx);
+#endif
avcodec_free_context(&ist->dec_ctx);
av_freep(&input_streams[i]);
@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
}
term_exit();
ffmpeg_exited = 1;
+
}
void remove_avoptions(AVDictionary **a, AVDictionary *b)
@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
if (ost->source_index >= 0)
ist = input_streams[ost->source_index];
+#ifdef RPI_DISPLAY
+ if (next_picture && ist != NULL)
+ {
+ if (!rpi_display)
+ rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+ display_frame(ist->dec_ctx, rpi_display, next_picture);
+ }
+#endif
+
if (filter->inputs[0]->frame_rate.num > 0 &&
filter->inputs[0]->frame_rate.den > 0)
duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
ist->dec_ctx->opaque = ist;
ist->dec_ctx->get_format = get_format;
ist->dec_ctx->get_buffer2 = get_buffer;
+
+#ifdef RPI_ZERO_COPY
+ // Overrides the above get_buffer2
+ av_rpi_zc_init(ist->dec_ctx);
+#endif
+
ist->dec_ctx->thread_safe_callbacks = 1;
av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index fd0d1f0..40d22d2 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -5,6 +5,11 @@ NAME = avcodec
HEADERS = avcodec.h \
avdct.h \
avfft.h \
+ rpi_qpu.h \
+ rpi_shader.h \
+ rpi_mailbox.h \
+ rpi_hevc_transform.h \
+ rpi_zc.h \
d3d11va.h \
dirac.h \
dv_profile.h \
@@ -43,6 +48,10 @@ OBJS = allcodecs.o \
resample.o \
resample2.o \
utils.o \
+ rpi_qpu.o \
+ rpi_shader.o \
+ rpi_mailbox.o \
+ rpi_zc.o \
vorbis_parser.o \
xiph.o \
@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
$(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
endif
+
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+ python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+ python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 54efaad..02a89c3 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -667,6 +667,7 @@ void avcodec_register_all(void)
REGISTER_PARSER(H261, h261);
REGISTER_PARSER(H263, h263);
REGISTER_PARSER(H264, h264);
+ REGISTER_PARSER(H264_MVC, h264_mvc);
REGISTER_PARSER(HEVC, hevc);
REGISTER_PARSER(MJPEG, mjpeg);
REGISTER_PARSER(MLP, mlp);
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a4ceca7..1354c14 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_epel_neon.o \
arm/hevcdsp_idct_neon.o \
- arm/hevcdsp_qpel_neon.o
+ arm/hevcdsp_qpel_neon.o \
+ arm/hevcdsp_sao_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b..0a3980a 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,13 +26,34 @@
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"
+
+#if UNCHECKED_BITSTREAM_READER
+#define LOAD_16BITS_BEHI\
+ "ldrh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#elif CONFIG_THUMB
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "it cs \n\t"\
+ "ldrhcs %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#else
+#define LOAD_16BITS_BEHI\
+ "ldr %[tmp] , [%[c], %[end]] \n\t"\
+ "cmp %[tmp] , %[ptr] \n\t"\
+ "ldrcsh %[tmp] , [%[ptr]] , #2 \n\t"\
+ "rev %[tmp] , %[tmp] \n\t"
+#endif
+
+
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
uint8_t *const state)
{
int bit;
+#if 0
void *reg_b, *reg_c, *tmp;
-
__asm__ volatile(
"ldrb %[bit] , [%[state]] \n\t"
"add %[r_b] , %[tables] , %[lps_off] \n\t"
@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
[mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
: "memory", "cc"
);
+#else
+ // *** Not thumb compatible yet
+ unsigned int reg_b, tmp;
+ __asm__ (
+ "ldrb %[bit] , [%[state]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b] , %[tmp], lsl #1] \n\t"
+// %bit = *state
+// %range = range
+// %tmp = RangeLPS
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state]] \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+
+ "bne 2f \n\t"
+ LOAD_16BITS_BEHI
+ "lsr %[tmp] , %[tmp] , #15 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "sub %[tmp] , %[tmp] , %[r_b] \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [ptr]"+&r"(c->bytestream),
+ [tmp]"=&r"(tmp)
+ : [state]"r"(state),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+#endif
return bit & 1;
}
+
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+ int rv = 0;
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "adc %[rv] , %[rv] , #0 \n\t"
+ "it cs \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+ unsigned int tmp;
+ __asm (
+ "lsl %[low] , #1 \n\t"
+ "cmp %[low] , %[range] , lsl #17 \n\t"
+ "ite cc \n\t"
+ "rsbcc %[rv] , %[rv] , #0 \n\t"
+ "subcs %[low] , %[low] , %[range], lsl #17 \n\t"
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 1f \n\t"
+ LOAD_16BITS_BEHI
+ "add %[low] , %[low] , %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , #0xFFFF \n\t"
+ "sub %[low] , %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ : // Outputs
+ [rv]"+&r"(rv),
+ [low]"+&r"(c->low),
+ [tmp]"=&r"(tmp),
+ [ptr]"+&r"(c->bytestream)
+ : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+ [c]"r"(c),
+ [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+ [range]"r"(c->range)
+ : "cc"
+ );
+ return rv;
+}
+
#endif /* HAVE_ARMV6T2_INLINE */
#endif /* AVCODEC_ARM_CABAC_H */
diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
new file mode 100644
index 0000000..31d3c59
--- /dev/null
+++ b/libavcodec/arm/hevc_cabac.h
@@ -0,0 +1,491 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+ unsigned int n;
+ __asm__ (
+ "rev %[n], %[x] \n\t"
+ : [n]"=r"(n)
+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+ :
+ );
+ return n << (bits & 7);
+}
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about or use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ int rv;
+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+ __asm__ (
+ "ssat %[rv], #16, %[t], ASR #1 \n\t"
+ : [rv]"=r"(rv)
+ : [t]"r"(t)
+ :
+ );
+ return rv;
+}
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ int t;
+ __asm__ (
+ "lsl %[t], %[coeff], #1 \n\t"
+ "lsrs %[t], %[t], %[shift] \n\t"
+ "it eq \n\t"
+ "subeq %[stat], %[stat], #1 \n\t"
+ "cmp %[t], #6 \n\t"
+ "adc %[stat], %[stat], #0 \n\t"
+ "usat %[stat], #8, %[stat] \n\t"
+ : [stat]"+&r"(*stat_coeff),
+ [t]"=&r"(t)
+ : [coeff]"r"(last_coeff_abs_level_remaining),
+ [shift]"r"(c_rice_param)
+ : "cc"
+ );
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater than 1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i, reg_b, st, tmp, bit, rv;
+ __asm__ (
+ "mov %[i] , #0 \n\t"
+ "mov %[rv] , #0 \n\t"
+ "1: \n\t"
+ "add %[i] , %[i] , #1 \n\t"
+ "cmp %[rv] , #0 \n\t"
+ "ite eq \n\t"
+ "usateq %[st] , #2 , %[i] \n\t"
+ "movne %[st] , #0 \n\t"
+
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "and %[bit] , %[bit] , #1 \n\t"
+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "it ne \n\t"
+ "cmpne %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+ "tst %[tmp] , %[tmp] \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+ "cmp %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [i]"=&r"(i),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st),
+ [rv]"=&r"(rv)
+ : [state0]"r"(state0),
+ [n]"r"(n),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+ return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t * const ctx_map,
+ uint8_t * p)
+{
+ unsigned int reg_b, tmp, st, bit;
+ __asm__ (
+ "1: \n\t"
+// Get bin from map
+ "ldrb %[st] , [%[ctx_map], %[n]] \n\t"
+
+// Load state & ranges
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "tst %[bit] , #1 \n\t"
+// GCC asm seems to need strbne written differently for thumb and arm
+#if CONFIG_THUMB
+ "it ne \n\t"
+ "strbne %[n] , [%[idx]] , #1 \n\t"
+#else
+ "strneb %[n] , [%[idx]] , #1 \n\t"
+#endif
+
+// Renorm
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "subs %[n] , %[n] , #1 \n\t"
+#if CONFIG_THUMB
+ "itt ne \n\t"
+ "lslsne %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#else
+ "lslnes %[tmp] , %[low] , #16 \n\t"
+ "bne 1b \n\t"
+#endif
+
+// If we have bits left then n must be 0 so give up now
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+ "rbit %[r_b] , %[low] \n\t"
+ "clz %[r_b] , %[r_b] \n\t"
+ "sub %[r_b] , %[r_b] , #16 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+// Check to see if we still have more to do
+ "cmp %[n] , #0 \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+&r"(c->low),
+ [range]"+&r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+&r"(c->bytestream),
+ [idx]"+&r"(p),
+ [n]"+&r"(n),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st)
+ : [state0]"r"(state0),
+ [ctx_map]"r"(ctx_map),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [byte]"M"(offsetof(CABACContext, bytestream)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+
+ return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+//
+// By and large these are (at best) no faster than their C equivalents - the
+// only one worth having is _peek where we do a slightly better job than the
+// compiler
+//
+// The others have been stashed here for reference in case larger scale asm
+// is attempted, in which case they might be a useful base
+
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+ uint32_t rv, tmp;
+ __asm__ (
+ "bic %[rv] , %[low], #1 \n\t"
+ "cmp %[inv] , #0 \n\t"
+ "it ne \n\t"
+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+ : // Outputs
+ [rv]"=&r"(rv),
+ [tmp]"=r"(tmp)
+ : // Inputs
+ [low]"r"(c->low),
+ [inv]"r"(c->range)
+ : // Clobbers
+ "cc"
+ );
+ return rv << 1;
+}
+
+#if 0
+
+// ***** Slower than the C :-(
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+{
+ uint32_t m, tmp;
+ __asm__ (
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[m], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+ "mul %[tmp], %[range], %[tmp] \n\t"
+
+ "rev %[m], %[m] \n\t"
+
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[m], %[m], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[m], lsr #9 \n\t"
+ : // Outputs
+ [m]"=&r"(m),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [n]"r"(n),
+ [val]"r"(val),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ );
+}
+
+
+// Works but slower than C
+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+{
+ uint32_t n, val, tmp, level;
+
+// PROFILE_START();
+
+ __asm__ (
+ // Peek
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Count bits (n = prefix)
+ "mvn %[n], %[val] \n\t"
+ "clz %[n], %[n] \n\t"
+
+ "lsl %[level], %[val], %[n] \n\t"
+ "subs %[tmp], %[n], #3 \n\t"
+ "blo 2f \n\t"
+
+ // prefix >= 3
+ // < tmp = prefix - 3
+ // > tmp = prefix + rice - 3
+ "add %[tmp], %[tmp], %[rice] \n\t"
+ // > n = prefix * 2 + rice - 3
+ "add %[n], %[tmp], %[n] \n\t"
+ "cmp %[n], #21 \n\t"
+ "bhi 3f \n\t"
+
+ "orr %[level], %[level], #0x80000000 \n\t"
+ "rsb %[tmp], %[tmp], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // > 22 bits used in total - need reload
+ "3: \n\t"
+
+ // Stash prefix + rice - 3 in level (only spare reg)
+ "mov %[level], %[tmp] \n\t"
+ // Restore n to flush value (prefix)
+ "sub %[n], %[n], %[tmp] \n\t"
+
+ // Flush + reload
+
+// "rsb %[tmp], %[n], #32 \n\t"
+// "lsr %[tmp], %[val], %[tmp] \n\t"
+// "mul %[tmp], %[range], %[tmp] \n\t"
+
+ // As it happens we know that all the bits we are flushing are 1
+ // so we can cheat slightly
+ "rsb %[tmp], %[range], %[range], lsl %[n] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[n], [%[ptr], %[bits], lsr #3] \n\t"
+ "rev %[n], %[n] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[n], %[n], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[n], lsr #9 \n\t"
+
+ // (reload)
+
+ "bic %[val], %[low], #1 \n\t"
+ "cmp %[inv], #0 \n\t"
+ "umullne %[tmp], %[val], %[inv], %[val] \n\t"
+ "lsl %[val], %[val], #1 \n\t"
+
+ // Build value
+
+ "mov %[n], %[level] \n\t"
+
+ "orr %[tmp], %[val], #0x80000000 \n\t"
+ "rsb %[level], %[level], #31 \n\t"
+ "lsr %[level], %[tmp], %[level] \n\t"
+
+ "mov %[tmp], #2 \n\t"
+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t"
+ "b 1f \n\t"
+
+ // prefix < 3
+ "2: \n\t"
+ "rsb %[tmp], %[rice], #31 \n\t"
+ "lsr %[level], %[level], %[tmp] \n\t"
+ "orr %[level], %[level], %[n], lsl %[rice] \n\t"
+ "add %[n], %[n], %[rice] \n\t"
+
+ "1: \n\t"
+ // Flush
+ "add %[n], %[n], #1 \n\t"
+
+ "rsb %[tmp], %[n], #32 \n\t"
+ "lsr %[tmp], %[val], %[tmp] \n\t"
+
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldr %[val], [%[ptr], %[bits], lsr #3] \n\t"
+
+ "mul %[tmp], %[range], %[tmp] \n\t"
+ "lsl %[tmp], %[tmp], #23 \n\t"
+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+ "rev %[val], %[val] \n\t"
+ "and %[tmp], %[bits], #7 \n\t"
+ "lsl %[val], %[val], %[tmp] \n\t"
+
+ "orr %[low], %[low], %[val], lsr #9 \n\t"
+ : // Outputs
+ [level]"=&r"(level),
+ [n]"=&r"(n),
+ [val]"=&r"(val),
+ [tmp]"=&r"(tmp),
+ [bits]"+&r"(c->by22.bits),
+ [low]"+&r"(c->low)
+ : // Inputs
+ [rice]"r"(c_rice_param),
+ [inv]"r"(c->range),
+ [range]"r"(c->by22.range),
+ [ptr]"r"(c->bytestream)
+ : // Clobbers
+ "cc"
+ );
+
+// PROFILE_ACC(residual_abs);
+
+ return level;
+}
+#endif
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
index 166bddb..a088cc3 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
vst1.8 {d4}, [r0]
bx lr
endfunc
+
+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ * int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ * MvField *curr, MvField *neigh, uint8_t *bs)
+ */
+function ff_hevc_deblocking_boundary_strengths_neon, export=1
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
+ ldmia ip, {v5-v7}
+1: ldmdb ip, {v1-v4}
+ ldrsb a3, [v5, #8] @ curr->ref_idx
+ ldrsb v8, [v5, #9]
+ ldrsb ip, [v6, #8] @ neigh->ref_idx
+ ldrsb lr, [v6, #9]
+ ldr v1, [v1, a3, lsl #2]
+ ldrb a3, [v5, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
+ ldrb v8, [v6, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
+ teq a3, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
+ tst a3, #1
+ itee ne
+ ldrne a3, [v5, #0] @ curr->mv[0]
+ ldreq a3, [v5, #4] @ curr->mv[1]
+ moveq v1, v2
+ tst v8, #1
+ itee ne
+ ldrne v8, [v6, #0] @ neigh->mv[0]
+ ldreq v8, [v6, #4] @ neigh->mv[1]
+ moveq v3, v4
+ teq v1, v3
+ bne 10f
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v8, a3
+ ssub16 a3, a3, v8
+ sel a3, a3, ip
+ ands a3, a3, lr
+ @ drop through
+10: it ne
+ movne a3, #1
+11: subs a2, a2, #1
+12:
+A strbhs a3, [v7], a4
+T itt hs
+T strbhs a3, [v7]
+T addhs v7, v7, a4
+ subs a2, a2, #1
+ bhs 12b
+
+ ldm sp, {a2, a3}
+ add ip, sp, #16*4
+ subs a1, a1, #1
+ add v5, v5, a3
+ add v6, v6, a3
+ bhi 1b
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
+ bne 10b
+
+ teq v1, v3
+ it eq
+ teqeq v2, v4
+ bne 40f
+ teq v1, v2
+ bne 30f
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 25f
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
+ ssub16 a3, v1, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v3, v2
+ ssub16 a3, v2, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+30: ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 a3, v1, v3
+ sel a3, a3, ip
+ ands a3, a3, lr
+ bne 10b
+ ssub16 ip, v4, v2
+ ssub16 a3, v2, v4
+ sel a3, a3, ip
+ ands a3, a3, lr
+ b 10b
+
+40: teq v1, v4
+ ite eq
+ teqeq v2, v3
+ bne 10b
+
+ ldrd v1, v2, [v5] @ curr->mv
+ ldrd v3, v4, [v6] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
+90: mov a3, #1
+ b 11b
+endfunc
diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
new file mode 100644
index 0000000..00eab9e
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_epel_neon.S
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro vextin_d4
+ vld1.8 {q10}, [r1], r2
+ vmov d16, d20
+ vext.8 d17, d20, d21, #1
+ vext.8 d18, d20, d21, #2
+ vext.8 d19, d20, d21, #3
+.endm
+
+.macro vextin_d4_8
+ vld1.8 d16, [r1], r2
+ vext.8 d17, d16, d16, #1
+ vext.8 d18, d16, d16, #2
+ vext.8 d19, d16, d16, #3
+.endm
+
+.macro load_coeffs_16b coeffs
+ ldr \coeffs, [\coeffs]
+ vdup.i8 d0, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d1, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d2, \coeffs
+ lsr \coeffs, #8
+ vdup.i8 d3, \coeffs
+.endm
+
+.macro epel_filter_16b out=q12
+ vmull.u8 q3, d16, d0
+ vmull.u8 q11, d19, d3
+ vmull.u8 \out, d17, d1
+ vmull.u8 q10, d18, d2
+ vadd.s16 q3, q11
+ vadd.s16 \out, q10
+ vsub.s16 \out, q3
+.endm
+
+.macro load_coeffs_32b coeffs
+ ldr \coeffs, [\coeffs]
+ vmov.i64 d4, #0
+ vmov.8 d4[0], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[2], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[4], \coeffs
+ lsr \coeffs, #8
+ vmov.8 d4[6], \coeffs
+.endm
+
+.macro epel_filter_32b
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q4, d25, d4[0]
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q6, d31, d4[3]
+
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q8, d27, d4[1]
+ vmull.s16 q9, d28, d4[2] // q14
+ vmull.s16 q10, d29, d4[2]
+ vadd.s32 q3, q5
+ vadd.s32 q4, q6
+ vadd.s32 q7, q9
+ vadd.s32 q8, q10
+ vsub.s32 q7, q3
+ vsub.s32 q8, q4
+ vqshrn.s32 d6, q7, #6
+ vqshrn.s32 d7, q8, #6
+.endm
+
+.macro epel_filter_32b_4
+ vmull.s16 q3, d24, d4[0] //q12
+ vmull.s16 q5, d30, d4[3] //q15
+ vmull.s16 q7, d26, d4[1] // q13
+ vmull.s16 q9, d28, d4[2] // q14
+ vadd.s32 q3, q5
+ vadd.s32 q7, q9
+ vsub.s32 q7, q3
+ vqshrn.s32 d6, q7, #6
+.endm
+
+function ff_hevc_put_epel_h_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #16] // mx
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+@ adr reaches if we are in thumb mode but not in arm
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ sub r1, #1
+ lsl r4, #1
+ load_coeffs_16b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: subs r3, #1
+ pld [r1]
+ vextin_d4
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ cmp r5, #4
+ bgt 8b
+4: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+2: subs r3, #1
+ pld [r1]
+ vextin_d4_8
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_v_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+T adr r12, epel_coeffs
+A adrl r12, epel_coeffs
+ add r7, r12
+ load_coeffs_16b r7
+ sub r1, r2
+ lsl r4, #1
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vld1.8 {d16}, [r1], r2
+ pld [r1]
+ vld1.8 {d17}, [r1], r2
+ pld [r1]
+ vld1.8 {d18}, [r1], r2
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 {q12}, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.16 d24, [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vld1.8 {d19}, [r1], r2
+ subs r3, #1
+ epel_filter_16b
+ vst1.32 d24[0], [r0], r4
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_hevc_put_epel_hv_neon_8, export=1
+ push {r4-r7}
+ mov r4, MAX_PB_SIZE
+ ldr r6, [sp, #16] // mx
+ ldr r7, [sp, #20] // my
+ ldr r5, [sp, #24] // width
+ sub r7, #1
+ lsl r7, #2
+ vpush {d8-d15}
+ adr r12, epel_coeffs
+ sub r6, #1
+ lsl r6, #2
+ add r6, r12 // mx epel coeff offset
+ add r7, r12
+ sub r1, #1
+ sub r1, r2
+ lsl r4, #1
+ load_coeffs_16b r6
+ load_coeffs_32b r7
+ mov r12, r3
+ mov r6, r0
+ mov r7, r1
+0: pld [r1]
+ vextin_d4
+ epel_filter_16b q12
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q13
+ pld [r1]
+ vextin_d4
+ epel_filter_16b q14
+ cmp r5, #6
+ bgt 8f
+ cmp r5, #4
+ blt 2f
+ b 4f
+8: pld [r1]
+ vextin_d4
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b
+ vst1.16 {q3}, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r3, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r1, r7
+ b 0b
+4: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.16 d6, [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 4b
+ subs r5, #4
+ beq 99f
+ mov r3, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #4
+ mov r1, r7
+ b 0b
+2: pld [r1]
+ vextin_d4_8
+ epel_filter_16b q15
+ subs r3, #1
+ epel_filter_32b_4
+ vst1.32 d6[0], [r0], r4
+ vmov q12, q13
+ vmov q13, q14
+ vmov q14, q15
+ bne 2b
+99: vpop {d8-d15}
+ pop {r4-r7}
+ bx lr
+endfunc
+
+epel_coeffs:
+ .byte 2, 58, 10, 2
+ .byte 4, 54, 16, 2
+ .byte 6, 46, 28, 4
+ .byte 4, 36, 36, 4
+ .byte 4, 28, 46, 6
+ .byte 2, 16, 54, 4
+ .byte 2, 10, 58, 2
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 5591807..49c70dd 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -22,6 +22,8 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/bit_depth_template.c"
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+
+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+
+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
ptrdiff_t srcstride, int height, \
@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
#undef PUT_PIXELS
+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+ ptrdiff_t srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, int width);
@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
}
+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int8_t offset_table[32] = { 0 };
+ int k, y, x;
+ int shift = 3; // BIT_DEPTH - 5
+ int cwidth = 0;
+
+ stride_src /= sizeof(pixel);
+ stride_dst /= sizeof(pixel);
+
+ for (k = 0; k < 4; k++)
+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+
+ if (height % 8 == 0)
+ cwidth = width;
+
+ switch(cwidth){
+ case 8:
+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ case 16:
+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ case 32:
+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ case 64:
+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+ break;
+ default:
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+ dst += stride_dst;
+ src += stride_src;
+ }
+ }
+}
+
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+ static const int8_t pos[4][2][2] = {
+ { { -1, 0 }, { 1, 0 } }, // horizontal
+ { { 0, -1 }, { 0, 1 } }, // vertical
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+ };
+ int8_t sao_offset_val[8]; // padding of 3 for vld
+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
+ int cwidth = 0;
+
+ for (x = 0; x < 5; x++) {
+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
+ }
+
+ if (height % 8 == 0)
+ cwidth = width;
+
+ stride_src /= sizeof(pixel);
+ stride_dst /= sizeof(pixel);
+
+ switch (cwidth) {
+ case 32:
+ switch(eo) {
+ case 0:
+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 1:
+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 2:
+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 3:
+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ }
+ break;
+ case 64:
+ switch(eo) {
+ case 0:
+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 1:
+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 2:
+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ case 3:
+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+ break;
+ }
+ break;
+ default:
+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ int diff0 = CMP(src[x], src[x + a_stride]);
+ int diff1 = CMP(src[x], src[x + b_stride]);
+ int idx = diff0 + diff1;
+ if (idx)
+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
+ }
+ src += stride_src;
+ dst += stride_dst;
+ }
+ }
+}
+#undef CMP
+
+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
+
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
if (bit_depth == 8) {
@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper;
+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper;
+ }
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_epel[x][1][0] = ff_hevc_put_epel_v_neon_8;
+ c->put_hevc_epel[x][0][1] = ff_hevc_put_epel_h_neon_8;
+ c->put_hevc_epel[x][1][1] = ff_hevc_put_epel_hv_neon_8;
}
+ c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
}
+
+ assert(offsetof(MvField, mv) == 0);
+ assert(offsetof(MvField, ref_idx) == 8);
+ assert(offsetof(MvField, pred_flag) == 10);
+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
}
diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
new file mode 100644
index 0000000..9c7808d
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_sao_neon.S
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro init_sao_band
+ pld [r1]
+ vld1.8 {q0, q1}, [r2] // offset table
+ ldr r2, [sp, #0] // stride_dst
+ ldr r12, [sp, #4] // height
+ vmov.u8 q3, #128
+.endm
+
+// 128 in q3
+// input q8 - q11
+.macro sao_band_64
+ vtbl.8 d24, {d0, d1, d2, d3}, d24
+ vadd.s8 q8, q3
+ vtbl.8 d25, {d0, d1, d2, d3}, d25
+ vadd.s8 q9, q3
+ vtbl.8 d26, {d0, d1, d2, d3}, d26
+ vadd.s8 q10, q3
+ vtbl.8 d27, {d0, d1, d2, d3}, d27
+ vadd.s8 q11, q3
+ vtbl.8 d28, {d0, d1, d2, d3}, d28
+ vqadd.s8 q8, q12
+ vtbl.8 d29, {d0, d1, d2, d3}, d29
+ vqadd.s8 q9, q13
+ vtbl.8 d30, {d0, d1, d2, d3}, d30
+ vqadd.s8 q10, q14
+ vtbl.8 d31, {d0, d1, d2, d3}, d31
+ vsub.s8 q8, q3
+ vqadd.s8 q11, q15
+ vsub.s8 q9, q3
+ vsub.s8 q10, q3
+ vsub.s8 q11, q3
+.endm
+
+function ff_hevc_sao_band_w8_neon_8, export=1
+ init_sao_band
+1: subs r12, #8
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+ vshr.u8 q12, q8, #3
+ vld1.8 {d18}, [r1, :64], r3
+ vld1.8 {d19}, [r1, :64], r3
+ vshr.u8 q13, q9, #3
+ vld1.8 {d20}, [r1, :64], r3
+ vld1.8 {d21}, [r1, :64], r3
+ vshr.u8 q14, q10, #3
+ vld1.8 {d22}, [r1, :64], r3
+ vld1.8 {d23}, [r1, :64], r3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ vst1.8 {d18}, [r0, :64], r2
+ vst1.8 {d19}, [r0, :64], r2
+ vst1.8 {d20}, [r0, :64], r2
+ vst1.8 {d21}, [r0, :64], r2
+ vst1.8 {d22}, [r0, :64], r2
+ vst1.8 {d23}, [r0, :64], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w16_neon_8, export=1
+ init_sao_band
+1: subs r12, #4
+ vld1.8 {q8}, [r1, :128], r3
+ vshr.u8 q12, q8, #3
+ vld1.8 {q9}, [r1, :128], r3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vld1.8 {q11}, [r1, :128], r3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {q8}, [r0, :128], r2
+ vst1.8 {q9}, [r0, :128], r2
+ vst1.8 {q10}, [r0, :128], r2
+ vst1.8 {q11}, [r0, :128], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w32_neon_8, export=1
+ init_sao_band
+1: subs r12, #2
+ vld1.8 {q8-q9}, [r1, :128], r3
+ vshr.u8 q12, q8, #3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {q8-q9}, [r0, :128], r2
+ vst1.8 {q10-q11}, [r0, :128], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w64_neon_8, export=1
+ init_sao_band
+1: subs r12, #1
+ pld [r1, r3]
+ vld1.8 {q8-q9}, [r1, :128]!
+ vshr.u8 q12, q8, #3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vshr.u8 q15, q11, #3
+ sub r1, #32
+ sao_band_64
+ vst1.8 {q8-q9}, [r0, :128]!
+ vst1.8 {q10-q11}, [r0, :128], r2
+ sub r0, #32
+ bne 1b
+
+ bx lr
+endfunc
+
+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0
+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0
+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2
+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2
+ vsub.s8 \out0, \tmp0, \out0 // diff0
+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+.endm
+
+.macro table64
+ vmov.s8 q13, #2 // 2 to all elements
+ vmov.32 d24[0], r4 // load offset table from general registers
+ vmov.32 d24[1], r5 // load rest of offset table
+
+ vadd.s8 q0, q13
+ vadd.s8 q1, q13
+ vadd.s8 q2, q13
+ vadd.s8 q3, q13
+
+ vmov.u8 q15, #128 // s8 #-128
+ vtbl.8 d0, {d24}, d0
+ vadd.s8 q13, q4, q15
+ vtbl.8 d1, {d24}, d1
+ vadd.s8 q14, q5, q15
+ vtbl.8 d2, {d24}, d2
+ vqadd.s8 q0, q13
+ vtbl.8 d3, {d24}, d3
+ vqadd.s8 q1, q14
+ vtbl.8 d4, {d24}, d4
+ vadd.s8 q13, q6, q15
+ vtbl.8 d5, {d24}, d5
+ vadd.s8 q14, q7, q15
+ vtbl.8 d6, {d24}, d6
+ vqadd.s8 q2, q13
+ vtbl.8 d7, {d24}, d7
+ vqadd.s8 q3, q14
+ vsub.s8 q0, q15
+ vsub.s8 q1, q15
+ vsub.s8 q2, q15
+ vsub.s8 q3, q15
+ vst1.8 {q0-q1}, [r0, :128]!
+ vst1.8 {q2-q3}, [r0, :128], r2
+ sub r0, #32
+.endm
+
+// input
+// a in q0 - q3
+// c in q4 - q7
+// b in q8 - q11
+// offset table in r7 and r5
+// output in q0 - q3
+// clobbers q12 - q15
+.macro edge_w64_body
+ diff32 q12, q13, q0, q1, q0, q1, q4, q5
+ diff32 q0, q1, q14, q15, q8, q9, q4, q5
+
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+
+ diff32 q14, q15, q2, q3, q2, q3, q6, q7
+ diff32 q2, q3, q12, q13, q10, q11, q6, q7
+
+ vadd.s8 q2, q14
+ vadd.s8 q3, q15
+ table64
+.endm
+
+.macro init_edge_64
+ push {r4-r5}
+ ldr r12, [sp, #8] // height
+ ldr r5, [sp, #12] // sao_offset_val_table
+ ldr r4, [r5]
+ add r5, #4
+ ldr r5, [r5]
+.endm
+
+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+ sub r1, #8
+1: subs r12, #1
+ vld1.64 {d7}, [r1, :64]!
+ vld1.64 {q4-q5}, [r1, :128]! // load c
+ vld1.64 {q6-q7}, [r1, :128]!
+ vld1.64 {d24}, [r1, :64], r3
+ sub r1, #72
+ // load a
+ vext.8 q0, q3, q4, #15
+ vext.8 q1, q4, q5, #15
+ vext.8 q2, q5, q6, #15
+ vext.8 q3, q6, q7, #15
+ // load b
+ vext.8 q8, q4, q5, #1
+ vext.8 q9, q5, q6, #1
+ vext.8 q10, q6, q7, #1
+ vext.8 q11, q7, q12, #1
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128]!
+ vld1.8 {q2-q3}, [r1, :128], r3
+ sub r1, #32
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128]!
+ vld1.8 {q10-q11}, [r1, :128], r3
+ sub r1, #32
+ edge_w64_body
+ // copy c to a
+ vmov.64 q0, q4
+ vmov.64 q1, q5
+ vmov.64 q2, q6
+ vmov.64 q3, q7
+ // copy b to c
+ vmov.64 q4, q8
+ vmov.64 q5, q9
+ vmov.64 q6, q10
+ vmov.64 q7, q11
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ sub r1, #1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ sub r1, #31
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+ // load b
+ add r1, #1
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #33
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ add r1, #1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ sub r1, #33
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+ // load b
+ sub r1, #1
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #31
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+.macro init_edge_32
+ ldr r12, [sp, #4] // sao_offset_val_table
+ vld1.32 {d31}, [r12]
+ ldr r12, [sp] // height
+.endm
+
+.macro diff out0, tmp0, in0, in1
+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0
+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0
+ vsub.s8 \out0, \tmp0, \out0 // diff0
+.endm
+
+.macro table32
+ vmov.s8 q10, #2
+ vadd.s8 q0, q10
+ vadd.s8 q1, q10
+ vmov.s8 q10, #128
+ vtbl.8 d0, {d31}, d0
+ vadd.s8 q11, q2, q10
+ vtbl.8 d1, {d31}, d1
+ vadd.s8 q12, q3, q10
+ vtbl.8 d2, {d31}, d2
+ vqadd.s8 q11, q0
+ vtbl.8 d3, {d31}, d3
+ vqadd.s8 q12, q1
+ vsub.s8 q0, q11, q10
+ vsub.s8 q1, q12, q10
+ vst1.8 {q0-q1}, [r0, :128], r2
+.endm
+
+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
+ init_edge_32
+ vpush {q4-q7}
+ sub r1, #4
+1: subs r12, #1
+ vld1.8 {q13-q14}, [r1]!
+ vld1.32 d30, [r1], r3
+ sub r1, #32
+ // a
+ vext.8 q0, q13, q14, #3
+ vext.8 q1, q14, q15, #3
+ vshr.u64 d24, d30, #24
+ // c
+ vext.8 q2, q13, q14, #4
+ vext.8 q3, q14, q15, #4
+ vshr.u64 d16, d30, #32
+ // diff0
+ diff32 q13, q14, q4, q5, q0, q1, q2, q3
+ diff d18, d25, d24, d16
+ // -diff1
+ vext.s8 q0, q13, q14, #1
+ vext.s8 q1, q14, q9, #1
+
+ vsub.s8 q0, q13, q0 //diff0 + diff1
+ vsub.s8 q1, q14, q1
+ table32
+ bne 1b
+ vpop {q4-q7}
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
+ init_edge_32
+ vpush {q4-q7}
+ // load a
+ sub r1, r3
+ vld1.8 {q0-q1}, [r1, :128], r3
+ // load c
+ vld1.8 {q2-q3}, [r1, :128], r3
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128], r3
+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
+ vadd.s8 q0, q4, q12 //diff0 + diff1
+ vadd.s8 q1, q5, q13
+ table32
+ // CMP ( c, a )
+ vneg.s8 q12, q4
+ vneg.s8 q13, q5
+ // c
+ vmov.64 q2, q8
+ vmov.64 q3, q9
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
+ init_edge_32
+ vpush {d8-d15}
+ // load a
+ sub r1, r3
+ sub r1, #8
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {d24}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q0, q10, q11, #7
+ vext.8 q1, q11, q12, #7
+ // load c
+ vld1.8 {d9}, [r1, :64]!
+ vld1.8 {q2-q3}, [r1, :64], r3
+ sub r1, #8
+ vext.8 q4, q4, q2, #15
+1: subs r12, #1
+ // load b
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {q12}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q8, q10, q11, #9
+ vext.8 q9, q11, q12, #9
+ vext.8 q6, q10, q11, #8
+ vext.8 q7, q11, q12, #8
+ vext.8 q5, q10, q11, #7
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3
+ diff32 q0, q1, q10, q11, q8, q9, q2, q3
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+ table32
+ // inputs for next loop iteration
+ // a
+ vmov.8 q0, q4
+ vext.8 q1, q2, q3, #15
+ // c
+ vmov.8 q2, q6
+ vmov.8 q3, q7
+ vmov.8 q4, q5
+ bne 1b
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
+ init_edge_32
+ sub r1, r3
+ // load a
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {d24}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q0, q10, q11, #1
+ vext.8 q1, q11, q12, #1
+ // load c
+ vld1.8 {q2-q3}, [r1, :64]!
+ vld1.8 {d30}, [r1, :64], r3
+ sub r1, #40
+1: subs r12, #1
+ // load b
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {q12}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q8, q10, q11, #7
+ vext.8 q9, q11, q12, #7
+ vext.8 q14, q12, q10, #7
+
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3
+ diff32 q0, q1, q10, q11, q8, q9, q2, q3
+
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+ table32
+
+ // inputs for next loop iteration
+ // a
+ vext.8 q0, q2, q3, #1
+ vext.8 q1, q3, q15, #1
+ // c
+ vext.8 q2, q8, q9, #1
+ vext.8 q3, q9, q14, #1
+ vext.8 d30, d28, d2, #1
+ bne 1b
+ bx lr
+endfunc
+
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 39713ed..25eb52b 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,8 @@ enum AVCodecID {
AV_CODEC_ID_SHEERVIDEO,
AV_CODEC_ID_YLC,
+ AV_CODEC_ID_H264_MVC,
+
/* various PCM "codecs" */
AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
#define FF_BUG_DC_CLIP 4096
#define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders.
#define FF_BUG_TRUNCATED 16384
+#define FF_BUG_GMC_UNSUPPORTED 32768
/**
* strictly follow the standard (MPEG-4, ...).
@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_CAVLC_444 44
+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118
+#define FF_PROFILE_H264_STEREO_HIGH 128
+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
#define FF_PROFILE_VC1_SIMPLE 0
#define FF_PROFILE_VC1_MAIN 1
@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
#endif
+ /**
+ * Opaque pointer for use by replacement get_buffer2 code
+ *
+ * @author jc (08/02/2016)
+ */
+ void * get_buffer_context;
} AVCodecContext;
AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 1bf1c62..ccfa991 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
typedef struct CABACContext{
int low;
int range;
- int outstanding_count;
+ union
+ {
+ int outstanding_count;
+ struct {
+ uint16_t bits;
+ uint16_t range;
+ } by22;
+ };
const uint8_t *bytestream_start;
const uint8_t *bytestream;
const uint8_t *bytestream_end;
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 9d94b72..535ebf0 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
+ {
+ .id = AV_CODEC_ID_H264_MVC,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "h264_mvc",
+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
/* various PCM "codecs" */
{
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index efe3555..16358aa 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -126,7 +126,9 @@ enum {
NAL_END_STREAM = 11,
NAL_FILLER_DATA = 12,
NAL_SPS_EXT = 13,
+ NAL_SPS_SUBSET = 15,
NAL_AUXILIARY_SLICE = 19,
+ NAL_SLICE_EXT = 20,
NAL_FF_IGNORE = 0xff0f001,
};
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index ce4bab2..b9b0c78 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
- uint8_t parse_history[6];
+ uint8_t parse_history[9];
int parse_history_count;
int parse_last_mb;
+ int is_mvc;
+ int slice_ext;
} H264ParseContext;
@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
} else if (state <= 5) {
int nalu_type = buf[i] & 0x1F;
if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
- nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+ nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+ nalu_type == NAL_SPS_SUBSET) {
if (pc->frame_start_found) {
i++;
goto found;
}
} else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
- nalu_type == NAL_IDR_SLICE) {
+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
state += 8;
+
+ p->slice_ext = (nalu_type == NAL_SLICE_EXT);
continue;
}
state = 7;
} else {
p->parse_history[p->parse_history_count++] = buf[i];
- if (p->parse_history_count > 5) {
+ if (p->parse_history_count > 8) {
unsigned int mb, last_mb = p->parse_last_mb;
GetBitContext gb;
- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
p->parse_history_count = 0;
mb= get_ue_golomb_long(&gb);
p->parse_last_mb = mb;
@@ -145,7 +150,7 @@ found:
pc->frame_start_found = 0;
if (p->is_avc)
return next_avc;
- return i - (state & 5) - 5 * (state > 7);
+ return i - (state & 5) - 8 * (state > 7);
}
static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
}
}
- parse_nal_units(s, avctx, buf, buf_size);
+ if (!p->is_mvc)
+ parse_nal_units(s, avctx, buf, buf_size);
if (avctx->framerate.num)
avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
if ((state & 0xFFFFFF00) != 0x100)
break;
nalu_type = state & 0x1F;
- if (nalu_type == NAL_SPS) {
+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
has_sps = 1;
} else if (nalu_type == NAL_PPS)
has_pps = 1;
@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
.parser_close = h264_close,
.split = h264_split,
};
+
+static av_cold int init_mvc(AVCodecParserContext *s)
+{
+ H264ParseContext *p = s->priv_data;
+ int ret = init(s);
+ if (ret < 0)
+ return ret;
+
+ p->is_mvc = 1;
+ return 0;
+}
+
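+// MVC variant of the H.264 parser: same frame-splitting logic, but parse_nal_units()
+// is skipped for MVC streams (see the is_mvc check in h264_parse above).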
+AVCodecParser ff_h264_mvc_parser = {
+ .codec_ids = { AV_CODEC_ID_H264_MVC },
+ .priv_data_size = sizeof(H264ParseContext),
+ .parser_init = init_mvc,
+ .parser_parse = h264_parse,
+ .parser_close = h264_close,
+ .split = h264_split,
+};
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index b478065..88dd40b 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -41,8 +41,186 @@
#include "hevc.h"
#include "profiles.h"
+#ifdef RPI
+ #include "rpi_qpu.h"
+ #include "rpi_user_vcsm.h"
+ // Move Inter prediction into separate pass
+ #define RPI_INTER
+
+ #ifdef RPI_INTER_QPU
+ // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+ #define RPI_MULTI_MAILBOX
+ #endif
+
+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+ // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+
+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+ //#define RPI_SIMULATE_QPUS
+ #ifdef RPI_WORKER
+ #include "pthread.h"
+ #endif
+
+ static void rpi_execute_dblk_cmds(HEVCContext *s);
+ static void rpi_execute_transform(HEVCContext *s);
+ static void rpi_launch_vpu_qpu(HEVCContext *s);
+ static void rpi_execute_pred_cmds(HEVCContext *s);
+ static void rpi_execute_inter_cmds(HEVCContext *s);
+ static void rpi_begin(HEVCContext *s);
+ static void flush_frame(HEVCContext *s,AVFrame *frame);
+ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+
+#endif
+
+// #define DISABLE_MC
+
+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+
+#ifndef av_mod_uintp2
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+ return a & ((1 << p) - 1);
+}
+# define av_mod_uintp2 av_mod_uintp2_c
+#endif
+
const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+
+#ifdef RPI_INTER_QPU
+
+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+// For each block of 64*64 the smallest block size is 8x4
+// We also need an extra command for the setup information
+
+#define RPI_CHROMA_COMMAND_WORDS 12
+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+// The QPU code for UV blocks only works up to a block width of 8
+#define RPI_CHROMA_BLOCK_WIDTH 8
+
+#define RPI_LUMA_COMMAND_WORDS 10
+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+
+// TODO Chroma only needs 4 taps
+
+// Actual filter goes -ve, +ve, +ve, -ve using these values
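+// These are the magnitudes of the standard HEVC 4-tap chroma interpolation filter,
+// indexed by the 1/8-pel fractional offset; the signs noted above are applied by the filter code.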
+static const uint32_t rpi_filter_coefs[8][1] = {
+ { ENCODE_COEFFS( 0, 64, 0, 0) },
+ { ENCODE_COEFFS( 2, 58, 10, 2) },
+ { ENCODE_COEFFS( 4, 54, 16, 2) },
+ { ENCODE_COEFFS( 6, 46, 28, 4) },
+ { ENCODE_COEFFS( 4, 36, 36, 4) },
+ { ENCODE_COEFFS( 4, 28, 46, 6) },
+ { ENCODE_COEFFS( 2, 16, 54, 4) },
+ { ENCODE_COEFFS( 2, 10, 58, 2) }
+};
+
+#endif
+
+
+#ifdef RPI_WORKER
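+// Two-pass decode model: the main decode thread (pass 0) parses the bitstream and
+// queues transform/prediction/deblock commands into the current job slot, while the
+// worker thread (pass 1) replays completed jobs. worker_head/worker_tail form a
+// simple counting queue of submitted vs completed jobs, protected by worker_mutex.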
+
+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+
+#define LOG_ENTER
+#define LOG_EXIT
+
+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+static void worker_submit_job(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ s->worker_tail++;
+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call this to say we have completed pass1
+static void worker_complete_job(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ s->worker_head++;
+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call this to wait for all jobs to have completed at the end of a frame
+static void worker_wait(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ while( s->worker_head !=s->worker_tail)
+ {
+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+// available to receive the next job.
+static void worker_pass0_ready(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ // tail is number of submitted jobs
+ // head is number of completed jobs
+ // tail-head is number of outstanding jobs in the queue
+ // we need to ensure there is at least 1 space left for us to use
+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+ {
+ // Wait until another job is completed
+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
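+// Worker thread entry point: block until a job has been submitted, run pass 1 for it
+// (QPU/VPU launch, inter, intra/residual, deblock), then mark the job complete.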
+static void *worker_start(void *arg)
+{
+ HEVCContext *s = (HEVCContext *)arg;
+ while(1) {
+ pthread_mutex_lock(&s->worker_mutex);
+
+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+ {
+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+
+ if (s->kill_worker) {
+ break;
+ }
+ LOG_ENTER
+ // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+ rpi_launch_vpu_qpu(s);
+ // Perform inter prediction
+ rpi_execute_inter_cmds(s);
+ // Wait for transform completion
+ vpu_wait(s->vpu_id);
+
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+
+ worker_complete_job(s);
+ LOG_EXIT
+ }
+ return NULL;
+}
+
+#endif
+
/**
* NOTE: Each function hls_foo correspond to the function foo in the
* specification (HLS stands for High Level Syntax).
@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
/* free everything allocated by pic_arrays_init() */
static void pic_arrays_free(HEVCContext *s)
{
+#ifdef RPI
+ int job;
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ if (s->coeffs_buf_arm[job][0]) {
+ gpu_free(&s->coeffs_buf_default[job]);
+ s->coeffs_buf_arm[job][0] = 0;
+ }
+ if (s->coeffs_buf_arm[job][2]) {
+ gpu_free(&s->coeffs_buf_accelerated[job]);
+ s->coeffs_buf_arm[job][2] = 0;
+ }
+ }
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+
+ if (dvq->vpu_cmds_arm) {
+ gpu_free(&dvq->deblock_vpu_gmem);
+ dvq->vpu_cmds_arm = 0;
+ }
+ }
+ }
+#endif
av_freep(&s->sao);
av_freep(&s->deblock);
@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
int ctb_count = sps->ctb_width * sps->ctb_height;
int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+#ifdef RPI
+ int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+ int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+ int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+ int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+ int job;
+
+ av_assert0(sps);
+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+ s->ctu_per_y_chan = s->max_ctu_count / 12;
+ s->ctu_per_uv_chan = s->max_ctu_count / 8;
+ printf("Allocated %d\n",coefs_per_row);
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+ if (!s->coeffs_buf_arm[job][0])
+ goto fail;
+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data
+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+ if (!s->coeffs_buf_arm[job][2])
+ goto fail;
+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards.
+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+ }
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ s->enable_rpi_deblock = !sps->sao_enabled;
+ s->setup_width = (sps->width+15) / 16;
+ s->setup_height = (sps->height+15) / 16;
+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+ {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+ const unsigned int total_size = cmd_size + y_size + uv_size;
+ int p_vc;
+ uint8_t * p_arm;
+ #if RPI_VPU_DEBLOCK_CACHED
+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+ #else
+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+ #endif
+ p_vc = dvq->deblock_vpu_gmem.vc;
+ p_arm = dvq->deblock_vpu_gmem.arm;
+
+ // Zap all
+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+
+ // Subdivide
+ dvq->vpu_cmds_arm = (void*)p_arm;
+ dvq->vpu_cmds_vc = p_vc;
+
+ p_arm += cmd_size;
+ p_vc += cmd_size;
+
+ dvq->y_setup_arm = (void*)p_arm;
+ dvq->y_setup_vc = (void*)p_vc;
+
+ p_arm += y_size;
+ p_vc += y_size;
+
+ dvq->uv_setup_arm = (void*)p_arm;
+ dvq->uv_setup_vc = (void*)p_vc;
+
+ dvq->cmd_id = -1;
+ }
+
+ s->dvq_n = 0;
+ s->dvq = s->dvq_ents + s->dvq_n;
+ }
+#endif
+
s->bs_width = (width >> 2) + 1;
s->bs_height = (height >> 2) + 1;
@@ -137,6 +422,29 @@ fail:
return AVERROR(ENOMEM);
}
+static void default_pred_weight_table(HEVCContext * const s)
+{
+ unsigned int i;
+ s->sh.luma_log2_weight_denom = 0;
+ s->sh.chroma_log2_weight_denom = 0;
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+ s->sh.luma_weight_l0[i] = 1;
+ s->sh.luma_offset_l0[i] = 0;
+ s->sh.chroma_weight_l0[i][0] = 1;
+ s->sh.chroma_offset_l0[i][0] = 0;
+ s->sh.chroma_weight_l0[i][1] = 1;
+ s->sh.chroma_offset_l0[i][1] = 0;
+ }
+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+ s->sh.luma_weight_l1[i] = 1;
+ s->sh.luma_offset_l1[i] = 0;
+ s->sh.chroma_weight_l1[i][0] = 1;
+ s->sh.chroma_offset_l1[i][0] = 0;
+ s->sh.chroma_weight_l1[i][1] = 1;
+ s->sh.chroma_offset_l1[i][1] = 0;
+ }
+}
+
static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
{
int i = 0;
@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
(s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
pred_weight_table(s, gb);
}
+ else
+ {
+ // Give us unit weights
+ default_pred_weight_table(s);
+ }
sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
return 0;
}
+#ifdef RPI
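+// With the RPI pass split enabled, intra prediction is not performed inline; instead an
+// RPI_PRED_INTRA command is queued here and replayed later by rpi_execute_pred_cmds().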
+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+{
+ if (s->enable_rpi) {
+ HEVCLocalContext *lc = s->HEVClc;
+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ cmd->type = RPI_PRED_INTRA;
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->x = x0;
+ cmd->y = y0;
+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+ cmd->mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+ } else {
+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+}
+#endif
+
static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int xBase, int yBase, int cb_xBase, int cb_yBase,
int log2_cb_size, int log2_trafo_size,
@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
int trafo_size = 1 << log2_trafo_size;
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+#endif
}
if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+#endif
}
} else if (blk_idx == 3) {
int trafo_size_h = 1 << (log2_trafo_size + 1);
int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, xBase, yBase,
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#endif
}
}
}
@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
* @param luma_offset additive offset applied to the luma prediction value
*/
+#ifdef RPI_INTER
+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
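+// When the RPI path is enabled, RPI_REDIRECT() routes motion compensation through the
+// rpi_*_mc_* wrappers below, which only queue an HEVCMvCmd for the current job; the
+// actual filtering is performed in pass 1 by rpi_execute_inter_cmds().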
+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ AVFrame *ref, const Mv *mv, int x_off, int y_off,
+ int block_w, int block_h, int luma_weight, int luma_offset)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_LUMA_UNI;
+ cmd->dst = dst;
+ cmd->dststride = dststride;
+ cmd->src = ref->data[0];
+ cmd->srcstride = ref->linesize[0];
+ cmd->mv = *mv;
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->weight = luma_weight;
+ cmd->offset = luma_offset;
+}
+
+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_LUMA_BI;
+ cmd->dst = dst;
+ cmd->dststride = dststride;
+ cmd->src = ref0->data[0];
+ cmd->srcstride = ref0->linesize[0];
+ cmd->mv = *mv0;
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->src1 = ref1->data[0];
+ cmd->srcstride1 = ref1->linesize[0];
+ cmd->mv1 = *mv1;
+ cmd->ref_idx[0] = current_mv->ref_idx[0];
+ cmd->ref_idx[1] = current_mv->ref_idx[1];
+}
+
+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_CHROMA_UNI;
+ cmd->dst = dst0;
+ cmd->dststride = dststride;
+ cmd->src = src0;
+ cmd->srcstride = srcstride;
+ cmd->mv = current_mv->mv[reflist];
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->weight = chroma_weight;
+ cmd->offset = chroma_offset;
+}
+
+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+ cmd->dst = dst0;
+ cmd->dststride = dststride;
+ cmd->src = ref0->data[cidx+1];
+ cmd->srcstride = ref0->linesize[cidx+1];
+ cmd->mv = current_mv->mv[0];
+ cmd->mv1 = current_mv->mv[1];
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->src1 = ref1->data[cidx+1];
+ cmd->srcstride1 = ref1->linesize[cidx+1];
+ cmd->ref_idx[0] = current_mv->ref_idx[0];
+ cmd->ref_idx[1] = current_mv->ref_idx[1];
+}
+
+#else
+#define RPI_REDIRECT(fn) fn
+#endif
+
static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref, const Mv *mv, int x_off, int y_off,
int block_w, int block_h, int luma_weight, int luma_offset)
@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
int idx = ff_hevc_pel_weight[block_w];
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> 2;
y_off += mv->y >> 2;
src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
* @param mv1 motion vector1 (relative to block position) to get pixel data from
* @param current_mv current motion vector structure
*/
- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
{
@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
intptr_t _mx = mx << (1 - hshift);
intptr_t _my = my << (1 - vshift);
+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> (2 + hshift);
y_off += mv->y >> (2 + vshift);
src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
int hshift = s->ps.sps->hshift[1];
int vshift = s->ps.sps->vshift[1];
+#ifdef DISABLE_MC
+ return;
+#endif
+
intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
}
}
-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH,
- int log2_cb_size, int partIdx, int idx)
+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
{
#define POS(c_idx, x, y) \
&s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
(((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
- HEVCLocalContext *lc = s->HEVClc;
+ HEVCLocalContext * const lc = s->HEVClc;
int merge_idx = 0;
struct MvField current_mv = {{{ 0 }}};
@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int y_cb = y0 >> log2_min_cb_size;
int x_pu, y_pu;
int i, j;
-
- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
if (!skip_flag)
lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi) {
+ const Mv * const mv = &current_mv.mv[0];
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my<<8) | mx;
+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+ const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+ uint32_t * y = s->curr_y_mvs;
+
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+
+ for(int start_x=0;start_x < nPbW;start_x+=16) {
+ const int bw = nPbW-start_x;
+ const int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+ *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+ {
+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l0[current_mv.ref_idx[0]],
s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ }
if (s->ps.sps->chroma_format_idc) {
- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ const Mv *mv = &current_mv.mv[0];
+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ intptr_t _mx = mx << (1 - hshift);
+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+
+ int x1_c = x0_c + (mv->x >> (2 + hshift));
+ int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ uint32_t *u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ int bw = nPbW_c-start_x;
+ int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
}
@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi) {
+ const int reflist = 1;
+ const Mv *mv = &current_mv.mv[reflist];
+ int mx = mv->x & 3;
+ int my = mv->y & 3;
+ int my_mx = (my<<8) + mx;
+ int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+ int x1 = x0 + (mv->x >> 2);
+ int y1 = y0 + (mv->y >> 2);
+ uint32_t *y = s->curr_y_mvs;
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ for(int start_x=0;start_x < nPbW;start_x+=16) {
+ int bw = nPbW-start_x;
+ int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+ *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+ *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+
+ {
+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l1[current_mv.ref_idx[1]],
s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ }
if (s->ps.sps->chroma_format_idc) {
- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ const int reflist = 1;
+ const int hshift = s->ps.sps->hshift[1];
+ const int vshift = s->ps.sps->vshift[1];
+ const Mv * const mv = &current_mv.mv[reflist];
+ const intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ const intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ const intptr_t _mx = mx << (1 - hshift);
+ const intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+
+ const int x1_c = x0_c + (mv->x >> (2 + hshift));
+ const int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ uint32_t * u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ const int bw = nPbW_c-start_x;
+ const int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
}
@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+#ifdef RPI_LUMA_QPU
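+ // Note the "&& 0": the QPU path for bi-directional luma is disabled here, so the
+ // RPI_REDIRECT(luma_mc_bi) route below is always taken for B blocks.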
+ if (s->enable_rpi && 0) {
+ const Mv *mv = &current_mv.mv[0];
+ int mx = mv->x & 3;
+ int my = mv->y & 3;
+ int my_mx = (my<<8) + mx;
+ const Mv *mv2 = &current_mv.mv[1];
+ int mx2 = mv2->x & 3;
+ int my2 = mv2->y & 3;
+ int my2_mx2 = (my2<<8) + mx2;
+ int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+ int x1 = x0 + (mv->x >> 2);
+ int y1 = y0 + (mv->y >> 2);
+ int x2 = x0 + (mv2->x >> 2);
+ int y2 = y0 + (mv2->y >> 2);
+ uint32_t *y = s->curr_y_mvs;
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+ int bw = nPbW-start_x;
+ int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+
+ *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+ s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+ s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+ {
+ RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
ref1->frame, &current_mv.mv[1], &current_mv);
+ }
if (s->ps.sps->chroma_format_idc) {
- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ const Mv *mv = &current_mv.mv[0];
+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ intptr_t _mx = mx << (1 - hshift);
+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+ int x1_c = x0_c + (mv->x >> (2 + hshift));
+ int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ const Mv *mv2 = &current_mv.mv[1];
+ intptr_t mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+ intptr_t my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+ intptr_t _mx2 = mx2 << (1 - hshift);
+ intptr_t _my2 = my2 << (1 - vshift); // Fractional part of motion vector
+
+ int x2_c = x0_c + (mv2->x >> (2 + hshift));
+ int y2_c = y0_c + (mv2->y >> (2 + vshift));
+
+
+ uint32_t *u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ int bw = nPbW_c-start_x;
+ int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+ *u++ = 0; // Intermediate results are not written back in first pass of B filtering
+ *u++ = 0;
+
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx2][0];
+ *u++ = rpi_filter_coefs[_my2][0];
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+ RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
}
}
@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
}
+#ifdef RPI
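+// Pass 1: run the in-loop filters (deblock/SAO) for the CTBs queued by this job.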
+static void rpi_execute_dblk_cmds(HEVCContext *s)
+{
+ int n;
+ int job = s->pass1_job;
+ int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ int (*p)[2] = s->dblk_cmds[job];
+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+ }
+ s->num_dblk_cmds[job] = 0;
+}
+
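+// Pass 1: flush the cached coefficient buffer and post the queued 16x16 and 32x32
+// inverse transforms to the VPU; completion is waited for later via vpu_wait(s->vpu_id).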
+static void rpi_execute_transform(HEVCContext *s)
+{
+ int i=2;
+ int job = s->pass1_job;
+ /*int j;
+ int16_t *coeffs = s->coeffs_buf_arm[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+ s->hevcdsp.idct[4-2](coeffs, 16);
+ }
+ i=3;
+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+ s->hevcdsp.idct[5-2](coeffs, 32);
+ }*/
+
+ gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+ //gpu_cache_flush(&s->coeffs_buf_accelerated);
+ //vpu_wait(s->vpu_id);
+
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+}
+
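+// Pass 1: replay the queued prediction commands - intra prediction for RPI_PRED_INTRA
+// entries, otherwise transform_add of the reconstructed residual.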
+static void rpi_execute_pred_cmds(HEVCContext *s)
+{
+ int i;
+ int job = s->pass1_job;
+ HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+#else
+ HEVCLocalContext *lc = s->HEVClc;
+#endif
+
+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+ if (cmd->type == RPI_PRED_INTRA) {
+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
+ lc->na.cand_left = (cmd->na >> 3) & 1;
+ lc->na.cand_up_left = (cmd->na >> 2) & 1;
+ lc->na.cand_up = (cmd->na >> 1) & 1;
+ lc->na.cand_up_right = (cmd->na >> 0) & 1;
+ s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+ } else {
+#ifdef RPI_PRECLEAR
+ int trafo_size = 1 << cmd->size;
+#endif
+ s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+#ifdef RPI_PRECLEAR
+ memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+#endif
+ }
+ }
+ s->num_pred_cmds[job] = 0;
+}
+
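+// Pass 1 software fallback for queued inter prediction: rebuild minimal AVFrame/MvField
+// views from each HEVCMvCmd and call the normal motion compensation functions.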
+static void rpi_execute_inter_cmds(HEVCContext *s)
+{
+ int job = s->pass1_job;
+ HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+ int n,cidx;
+ AVFrame myref;
+ AVFrame myref1;
+ struct MvField mymv;
+ if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+ printf("Overflow inter_cmds\n");
+ exit(-1);
+ }
+ for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+ switch(cmd->cmd) {
+ case RPI_CMD_LUMA_UNI:
+ myref.data[0] = cmd->src;
+ myref.linesize[0] = cmd->srcstride;
+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+ break;
+ case RPI_CMD_LUMA_BI:
+ myref.data[0] = cmd->src;
+ myref.linesize[0] = cmd->srcstride;
+ myref1.data[0] = cmd->src1;
+ myref1.linesize[0] = cmd->srcstride1;
+ mymv.ref_idx[0] = cmd->ref_idx[0];
+ mymv.ref_idx[1] = cmd->ref_idx[1];
+ luma_mc_bi(s, cmd->dst, cmd->dststride,
+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+ &myref1, &cmd->mv1, &mymv);
+ break;
+ case RPI_CMD_CHROMA_UNI:
+ mymv.mv[0] = cmd->mv;
+ chroma_mc_uni(s, cmd->dst,
+ cmd->dststride, cmd->src, cmd->srcstride, 0,
+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
+ break;
+ case RPI_CMD_CHROMA_BI:
+ case RPI_CMD_CHROMA_BI+1:
+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
+ myref.data[cidx+1] = cmd->src;
+ myref.linesize[cidx+1] = cmd->srcstride;
+ myref1.data[cidx+1] = cmd->src1;
+ myref1.linesize[cidx+1] = cmd->srcstride1;
+ mymv.ref_idx[0] = cmd->ref_idx[0];
+ mymv.ref_idx[1] = cmd->ref_idx[1];
+ mymv.mv[0] = cmd->mv;
+ mymv.mv[1] = cmd->mv1;
+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
+ break;
+ }
+ }
+ s->num_mv_cmds[job] = 0;
+}
+
+static void rpi_do_all_passes(HEVCContext *s)
+{
+ // Kick off QPUs and VPUs
+ rpi_launch_vpu_qpu(s);
+ // Perform luma inter prediction
+ rpi_execute_inter_cmds(s);
+ // Wait for transform completion
+ vpu_wait(s->vpu_id);
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+ // Prepare next batch
+ rpi_begin(s);
+}
+
+#endif
+
+#ifdef RPI
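+// Pass 0 setup for a new job: rewrite the per-QPU setup headers for the chroma and luma
+// uniform streams and reset the CTU count before commands are appended.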
+static void rpi_begin(HEVCContext *s)
+{
+ int job = s->pass0_job;
+ int i;
+#ifdef RPI_INTER_QPU
+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1];
+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1];
+
+ for(i=0;i<8;i++) {
+ s->u_mvs[job][i] = s->mvs_base[job][i];
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = pic_width;
+ *s->u_mvs[job][i]++ = pic_height;
+ *s->u_mvs[job][i]++ = s->frame->linesize[1];
+ *s->u_mvs[job][i]++ = s->frame->linesize[2];
+ *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+ *s->u_mvs[job][i]++ = 0;
+ *s->u_mvs[job][i]++ = i; // Select section of VPM (avoid collisions with 3d unit)
+ }
+ s->curr_u_mvs = s->u_mvs[job][0];
+#endif
+
+#ifdef RPI_LUMA_QPU
+ for(i=0;i<12;i++) {
+ // This needs to have a generally similar structure to the
+ // actual filter code as various pipelined bits need to land correctly
+ // when inserted by the filter requests
+ s->y_mvs[job][i] = s->y_mvs_base[job][i];
+ *s->y_mvs[job][i]++ = 0; // y_x
+ *s->y_mvs[job][i]++ = 0; // ref_y_base
+ *s->y_mvs[job][i]++ = 0; // y2_x2
+ *s->y_mvs[job][i]++ = 0; // ref_y2_base
+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+ *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight denom + 6
+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+ *s->y_mvs[job][i]++ = 0; // Next kernel
+ }
+ s->curr_y_mvs = s->y_mvs[job][0];
+#endif
+ s->ctu_count = 0;
+}
+#endif
+
+#ifdef RPI_SIMULATE_QPUS
+
+static int32_t clipx(int x,int FRAME_WIDTH)
+{
+ if (x<=0) return 0;
+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
+ return x;
+}
+
+static int32_t clipy(int y,int FRAME_HEIGHT)
+{
+ if (y<=0) return 0;
+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
+ return y;
+}
+
+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+{
+ int32_t vsum = 0;
+ int x, y;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+
+ vsum += lumaFilter[my][y]*hsum;
+ }
+ vsum >>= 6;
+ vsum = (((vsum*weight)+round)>>denom)+offset;
+
+ return av_clip_uint8( vsum );
+}*/
+
+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+{
+ int32_t vsum = 0;
+ int x, y;
+ int chromaFilterH[4];
+ int chromaFilterV[4];
+ int i;
+ int offset_after = offset_weight>>16;
+ int weight = (offset_weight<<16)>>16;
+ for(i=0;i<4;i++) {
+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+ }
+
+ for (y = 0; y < 4; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 4; x++)
+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+
+ vsum += chromaFilterV[y]*hsum;
+ }
+ vsum >>= 6;
+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+
+ return vsum;
+}
+
+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+
+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+{
+ int32_t vsum = 0;
+ int x, y;
+ int i;
+ int offset_after = offset_weight>>16;
+ int weight = (offset_weight<<16)>>16;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+
+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+ }
+ vsum >>= 6;
+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+
+ return vsum;
+}
+
+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+{
+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+ int pitch = frame->linesize[cIdx];
+ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+ if (p>=base && p<base+pitch*pic_height) {
+ return frame->data[cIdx] + (p-base);
+ }
+ return NULL;
+}
+
+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+{
+ SliceHeader *sh = &s->sh;
+ uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+ int i;
+ if (arm) return arm;
+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+ {
+ for(i=0;i<sh->nb_refs[L0];i++) {
+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+ if (arm) return arm;
+ }
+ }
+ if (sh->slice_type == B_SLICE) {
+ for(i=0;i<sh->nb_refs[L1];i++) {
+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+ if (arm) return arm;
+ }
+ }
+ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+ exit(-1);
+ return NULL;
+}
+
+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+{
+ uint32_t next_kernel;
+ uint32_t x0;
+ uint32_t y0;
+ uint8_t *ref_u_base;
+ uint8_t *ref_v_base;
+ uint32_t frame_width = p[5];
+ uint32_t frame_height = p[6];
+ uint32_t pitch = p[7];
+ uint32_t dst_pitch = p[8];
+ int32_t offset_before = p[9];
+ int32_t denom = p[10];
+ uint32_t vpm_id = p[11];
+ uint32_t tmp_u_dst[256];
+ uint32_t tmp_v_dst[256];
+ while(1) {
+ p += 12;
+ next_kernel = p[0-12];
+ x0 = p[1-12];
+ y0 = p[2-12];
+ if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+ int x,y;
+ uint32_t width_height = p[5];
+ uint32_t hcoeffs = p[6];
+ uint32_t vcoeffs = p[7];
+ uint32_t offset_weight_u = p[8];
+ uint32_t offset_weight_v = p[9];
+ uint8_t *this_u_dst;
+ uint8_t *this_v_dst;
+ uint32_t width = width_height >> 16;
+ uint32_t height = (width_height << 16) >> 16;
+ ref_u_base = compute_arm_addr(s,p[3-12],1);
+ ref_v_base = compute_arm_addr(s,p[4-12],2);
+ if (next_kernel!=s->mc_filter_uv_b0)
+ {
+ this_u_dst = compute_arm_addr(s,p[10],1);
+ this_v_dst = compute_arm_addr(s,p[11],2);
+ }
+ for (y=0; y<height; ++y) {
+ for (x=0; x<width; ++x) {
+ if (next_kernel==s->mc_filter_uv) {
+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+ } else if (next_kernel==s->mc_filter_uv_b0) {
+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+ tmp_u_dst[x+y*16] = refa;
+ tmp_v_dst[x+y*16] = refb;
+ } else {
+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+ }
+ }
+ }
+ } else {
+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+ break;
+ }
+ }
+}
+
+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+{
+ uint32_t next_kernel;
+ int y_x,y2_x2;
+ int x0;
+ int y0;
+ int x2;
+ int y2;
+ uint32_t *p0 = p;
+ uint8_t *ref_y_base;
+ uint8_t *ref_y2_base;
+ uint32_t frame_width_height = p[4];
+ uint32_t frame_width = frame_width_height>>16;
+ uint32_t frame_height = (frame_width_height<<16)>>16;
+ uint32_t pitch = p[5];
+ uint32_t dst_pitch = p[6];
+ int offset_shift = p[7];
+ int32_t offset_before = offset_shift>>16;
+ int32_t denom = (offset_shift<<16)>>16;
+ while(1) {
+ p += 9;
+ next_kernel = p[8-9];
+ y_x = p[0-9];
+ x0 = (y_x<<16)>>16;
+ y0 = y_x>>16;
+ y2_x2 = p[2-9];
+ x2 = (y2_x2<<16)>>16;
+ y2 = y2_x2>>16;
+
+ if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+ int x,y;
+ uint32_t width_height = p[4];
+ uint32_t my2_mx2_my_mx = p[5];
+ uint32_t offset_weight = p[6];
+ uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+ uint32_t width = width_height >> 16;
+ uint32_t height = (width_height << 16) >> 16;
+ uint8_t *dst_base = s->frame->data[0];
+ ref_y_base = compute_arm_addr(s,p[1-9],0);
+ ref_y2_base = compute_arm_addr(s,p[3-9],0);
+ for (y=0; y<height; ++y) {
+ for (x=0; x<width; ++x) {
+ if (next_kernel==s->mc_filter) {
+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+ refa = av_clip_uint8(refa);
+ this_dst[x+y*dst_pitch] = refa;
+ }
+ else {
+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+ }
+ }
+ }
+ } else {
+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+ break;
+ }
+ }
+}
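+
+// Note on the uniform layout walked above: each 9-word group follows the
+// per-command comment in the loop (y_x, frame_base, y2_x2, frame_base2,
+// width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel).
+// The p[n-9] accesses pick the coordinates, reference bases and next_kernel
+// up from the group just stepped past, while the width/coefficients/dst come
+// from the current group - this mirrors the way the real QPU code issues its
+// texture requests one command ahead (see the dummy-location comments in
+// rpi_launch_vpu_qpu below).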
+
+static void rpi_simulate_inter_qpu(HEVCContext *s)
+{
+ // First run the transform as normal
+ int i;
+ rpi_execute_transform(s);
+ for(i=0;i<8;i++)
+ {
+ rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+ }
+ for(i=0;i<12;i++)
+ {
+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+ }
+}
+
+#endif
+
+#ifdef RPI_INTER_QPU
+
+static void rpi_launch_vpu_qpu(HEVCContext *s)
+{
+ int k;
+ int job = s->pass1_job;
+ int i;
+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+#ifdef RPI_LUMA_QPU
+ uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+#endif
+ if (s->sh.slice_type == I_SLICE) {
+#ifdef RPI_MULTI_MAILBOX
+ rpi_execute_transform(s);
+ return;
+#endif
+ }
+ for(k=0;k<8;k++) {
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+ av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+ }
+
+ s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+
+#ifdef RPI_LUMA_QPU
+ for(k=0;k<12;k++) {
+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+ s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
+ av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+ }
+ s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+#endif
+
+#ifdef RPI_SIMULATE_QPUS
+ rpi_simulate_inter_qpu(s);
+ return;
+#endif
+
+#ifdef RPI_MULTI_MAILBOX
+#ifdef RPI_CACHE_UNIF_MVS
+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+#else
+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+#endif
+
+#if 1
+ {
+ unsigned int i;
+ uint32_t * p;
+ uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
+
+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
+ *p++ = code;
+ }
+
+ code = qpu_get_fn(QPU_MC_SETUP);
+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
+ *p++ = code;
+ }
+
+ s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
+ vpu_get_constants(),
+ s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8,
+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10,
+ 0,
+ // QPU job 1
+ QPU_N_UV,
+ mail_uv,
+ // QPU job 2
+ QPU_N_Y,
+ mail_y
+ );
+ }
+
+#else
+ s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+ qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+#ifdef RPI_LUMA_QPU
+ qpu_get_fn(QPU_MC_SETUP),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+#else
+ 0,
+ 0,0,0,0,
+ 0,0,0,0,
+ 0,0,0,0
+#endif
+ );
+#endif
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+#else
+#error Code rotted here
+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+ );
+#endif
+
+
+}
+#else
+
+#ifdef RPI
+static void rpi_launch_vpu_qpu(HEVCContext *s)
+{
+ rpi_execute_transform(s);
+}
+#endif
+
+#endif
+
+#ifdef RPI
+
+#ifndef RPI_FAST_CACHEFLUSH
+#error RPI_FAST_CACHEFLUSH is broken
+static void flush_buffer(AVBufferRef *bref) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+ gpu_cache_flush(p);
+}
+#endif
+
+static void flush_frame(HEVCContext *s,AVFrame *frame)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+ int n = s->ps.sps->height;
+ int curr_y = 0;
+ int curr_uv = 0;
+ int n_uv = n >> s->ps.sps->vshift[1];
+ int sz,base;
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)(p.arm) + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)(p.arm) + base;
+ iocache.s[1].size = sz;
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)(p.arm) + base;
+ iocache.s[2].size = sz;
+ vcsm_clean_invalid( &iocache );
+#else
+ flush_buffer(frame->buf[0]);
+ flush_buffer(frame->buf[1]);
+ flush_buffer(frame->buf[2]);
+#endif
+}
+
+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ int n;
+ int curr_y;
+ int curr_uv;
+ int n_uv;
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+ int sz,base;
+ int (*d)[2] = s->dblk_cmds[job];
+ int low=(*d)[1];
+ int high=(*d)[1];
+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+ int y = (*d)[1];
+ low=FFMIN(low,y);
+ high=FFMAX(high,y);
+ }
+ curr_y = low;
+ n = high+(1 << s->ps.sps->log2_ctb_size);
+ curr_uv = curr_y >> s->ps.sps->vshift[1];
+ n_uv = n >> s->ps.sps->vshift[1];
+
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)(p.arm) + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)(p.arm) + base;
+ iocache.s[1].size = sz;
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)(p.arm) + base;
+ iocache.s[2].size = sz;
+
+ iocache.s[3].handle = p0->vcsm_handle;
+ iocache.s[3].cmd = 3; // clean+invalidate
+ iocache.s[3].addr = (int) p0->arm;
+ iocache.s[3].size = p0->numbytes;
+ if (p1) {
+ iocache.s[4].handle = p1->vcsm_handle;
+ iocache.s[4].cmd = 3; // clean+invalidate
+ iocache.s[4].addr = (int) p1->arm;
+ iocache.s[4].size = p1->numbytes;
+ }
+ if (p2) {
+ iocache.s[5].handle = p2->vcsm_handle;
+ iocache.s[5].cmd = 3; // clean+invalidate
+ iocache.s[5].addr = (int) p2->arm;
+ iocache.s[5].size = p2->numbytes;
+ }
+ vcsm_clean_invalid( &iocache );
+#else
+ flush_buffer(frame->buf[0]);
+ flush_buffer(frame->buf[1]);
+ flush_buffer(frame->buf[2]);
+ gpu_cache_flush3(p0, p1, p2);
+#endif
+}
+
+#endif
+
static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
{
HEVCContext *s = avctxt->priv_data;
@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
int y_ctb = 0;
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+#ifdef RPI
+ s->enable_rpi = s->ps.sps->bit_depth == 8
+ && !s->ps.pps->cross_component_prediction_enabled_flag;
+
+ if (!s->enable_rpi) {
+ if (s->ps.pps->cross_component_prediction_enabled_flag)
+ printf("Cross component\n");
+ }
+#endif
+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L0],s->sh.nb_refs[L1]);
+
if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
return AVERROR_INVALIDDATA;
@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
}
}
+#ifdef RPI_WORKER
+ s->pass0_job = 0;
+ s->pass1_job = 0;
+#endif
+#ifdef RPI
+ rpi_begin(s);
+#endif
+
while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
+#ifdef RPI_INTER_QPU
+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+#endif
+#ifdef RPI_LUMA_QPU
+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+#endif
+
more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+#ifdef RPI_INTER_QPU
+ s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+#endif
+#ifdef RPI_LUMA_QPU
+ s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+#endif
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+ //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+ //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+ //av_assert0(s->pass0_job>=0);
+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+ s->ctu_count++;
+ //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+
+ if ( s->ctu_count >= s->max_ctu_count ) {
+#ifdef RPI_WORKER
+ if (s->used_for_ref) {
+ // Split work load onto separate threads so we make as rapid progress as possible with this frame
+ // Pass on this job to worker thread
+ worker_submit_job(s);
+ // Make sure we have space to prepare the next job
+ worker_pass0_ready(s);
+
+ // Prepare the next batch of commands
+ rpi_begin(s);
+ } else {
+ // Non-ref frame so do it all on this thread
+ rpi_do_all_passes(s);
+ }
+#else
+ rpi_do_all_passes(s);
+#endif
+ }
+
+ }
+#endif
+
+
if (more_data < 0) {
s->tab_slice_address[ctb_addr_rs] = -1;
return more_data;
@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
ctb_addr_ts++;
ff_hevc_save_states(s, ctb_addr_ts);
+#ifdef RPI
+ if (s->enable_rpi)
+ continue;
+#endif
ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
}
+#ifdef RPI
+
+#ifdef RPI_WORKER
+ // Wait for the worker to finish all its jobs
+ if (s->enable_rpi) {
+ worker_wait(s);
+ }
+#endif
+
+ // Finish off any half-completed rows
+ if (s->enable_rpi && s->ctu_count) {
+ rpi_do_all_passes(s);
+ }
+
+#endif
+
if (x_ctb + ctb_size >= s->ps.sps->width &&
y_ctb + ctb_size >= s->ps.sps->height)
ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
s = s1->sList[self_id];
lc = s->HEVClc;
+#ifdef RPI
+ s->enable_rpi = 0;
+ //printf("Wavefront\n");
+#endif
+
if(ctb_row) {
ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
if (ret < 0)
return ret;
+ s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+ s->nal_unit_type == NAL_TSA_N ||
+ s->nal_unit_type == NAL_STSA_N ||
+ s->nal_unit_type == NAL_RADL_N ||
+ s->nal_unit_type == NAL_RASL_N);
+
+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+ s->is_decoded = 0;
+ break;
+ }
if (s->max_ra == INT_MAX) {
if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
s->max_ra = s->poc;
@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
}
fail:
- if (s->ref && s->threads_type == FF_THREAD_FRAME)
+ if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+#endif
ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-
+ } else if (s->ref) {
+#ifdef RPI_INTER_QPU
+ // When running single threaded we need to flush the whole frame
+ flush_frame(s,s->frame);
+#endif
+ }
return ret;
}
@@ -3064,6 +4625,41 @@ fail:
return AVERROR(ENOMEM);
}
+#ifdef RPI_WORKER
+static av_cold void hevc_init_worker(HEVCContext *s)
+{
+ int err;
+ pthread_cond_init(&s->worker_cond_head, NULL);
+ pthread_cond_init(&s->worker_cond_tail, NULL);
+ pthread_mutex_init(&s->worker_mutex, NULL);
+
+ s->worker_tail=0;
+ s->worker_head=0;
+ s->kill_worker=0;
+ err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+ if (err) {
+ printf("Failed to create worker thread\n");
+ exit(-1);
+ }
+}
+
+static av_cold void hevc_exit_worker(HEVCContext *s)
+{
+ void *res;
+ s->kill_worker=1;
+ pthread_cond_broadcast(&s->worker_cond_tail);
+ pthread_join(s->worker_thread, &res);
+
+ pthread_cond_destroy(&s->worker_cond_head);
+ pthread_cond_destroy(&s->worker_cond_tail);
+ pthread_mutex_destroy(&s->worker_mutex);
+
+ s->worker_tail=0;
+ s->worker_head=0;
+ s->kill_worker=0;
+}
+#endif
+
static av_cold int hevc_decode_free(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
av_freep(&s->cabac_state);
+#ifdef RPI
+
+#ifdef RPI_WORKER
+ hevc_exit_worker(s);
+#endif
+
+ for(i=0;i<RPI_MAX_JOBS;i++) {
+ av_freep(&s->unif_mv_cmds[i]);
+ av_freep(&s->univ_pred_cmds[i]);
+
+#ifdef RPI_INTER_QPU
+ if (s->unif_mvs[i]) {
+ gpu_free( &s->unif_mvs_ptr[i] );
+ s->unif_mvs[i] = 0;
+ }
+#endif
+#ifdef RPI_LUMA_QPU
+ if (s->y_unif_mvs[i]) {
+ gpu_free( &s->y_unif_mvs_ptr[i] );
+ s->y_unif_mvs[i] = 0;
+ }
+#endif
+ }
+
+#endif
+
for (i = 0; i < 3; i++) {
av_freep(&s->sao_pixel_buffer_h[i]);
av_freep(&s->sao_pixel_buffer_v[i]);
@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
return 0;
}
+#ifdef RPI
+#ifdef RPI_PRECLEAR
+static av_cold void memclear16(int16_t *p, int n)
+{
+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+ //int i;
+ //for(i=0;i<n;i++)
+ // p[i] = 0;
+}
+#endif
+#endif
+
static av_cold int hevc_init_context(AVCodecContext *avctx)
{
HEVCContext *s = avctx->priv_data;
int i;
+ int job;
s->avctx = avctx;
@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
s->HEVClcList[0] = s->HEVClc;
s->sList[0] = s;
+#ifdef RPI
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+ if (!s->unif_mv_cmds[job])
+ goto fail;
+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+ if (!s->univ_pred_cmds[job])
+ goto fail;
+ }
+
+#ifdef RPI_INTER_QPU
+ // We divide the image into blocks 256 wide and 64 high
+  // We support widths of up to 2048 pixels
+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+ // Also add space for the startup command for each stream.
+
+ {
+ int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+ uint32_t *p;
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+#ifdef RPI_CACHE_UNIF_MVS
+ gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+#else
+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+#endif
+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+
+ // Set up initial locations for uniform streams
+ p = s->unif_mvs[job];
+ for(i = 0; i < 8; i++) {
+ s->mvs_base[job][i] = p;
+ p += uv_commands_per_qpu;
+ }
+ }
+ s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+ s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+ s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+ }
+
+#endif
+#ifdef RPI_LUMA_QPU
+ for(job=0;job<RPI_MAX_JOBS;job++)
+ {
+ int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+ uint32_t *p;
+#ifdef RPI_CACHE_UNIF_MVS
+ gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+#else
+ gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+#endif
+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+
+ // Set up initial locations for uniform streams
+ p = s->y_unif_mvs[job];
+ for(i = 0; i < 12; i++) {
+ s->y_mvs_base[job][i] = p;
+ p += y_commands_per_qpu;
+ }
+ }
+ s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+ s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+#endif
+ //gpu_malloc_uncached(2048*64,&s->dummy);
+
+ s->enable_rpi = 0;
+
+#ifdef RPI_WORKER
+ hevc_init_worker(s);
+#endif
+
+#endif
+
s->cabac_state = av_malloc(HEVC_CONTEXTS);
if (!s->cabac_state)
goto fail;
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index be91010..6b03ea8 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -23,6 +23,9 @@
#ifndef AVCODEC_HEVC_H
#define AVCODEC_HEVC_H
+// define RPI to split the CABAC/prediction/transform into separate stages
+#include "config.h"
+
#include "libavutil/buffer.h"
#include "libavutil/md5.h"
@@ -37,6 +40,29 @@
#include "thread.h"
#include "videodsp.h"
+// define RPI to split the CABAC/prediction/transform into separate stages
+#ifdef RPI
+
+ #include "rpi_qpu.h"
+ // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+ #define RPI_INTER_QPU
+
+ #ifdef RPI_INTER_QPU
+ // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+ #define RPI_LUMA_QPU
+ #endif
+
+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+ #define RPI_MAX_JOBS 2
+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+ #define RPI_WORKER
+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+// #define RPI_DEBLOCK_VPU
+
+#endif
+
+#define RPI_VPU_DEBLOCK_CACHED 1
+
#define MAX_DPB_SIZE 16 // A.4.1
#define MAX_REFS 16
@@ -660,17 +686,6 @@ typedef struct CodingUnit {
uint8_t cu_transquant_bypass_flag;
} CodingUnit;
-typedef struct Mv {
- int16_t x; ///< horizontal component of motion vector
- int16_t y; ///< vertical component of motion vector
-} Mv;
-
-typedef struct MvField {
- DECLARE_ALIGNED(4, Mv, mv)[2];
- int8_t ref_idx[2];
- int8_t pred_flag;
-} MvField;
-
typedef struct NeighbourAvailable {
int cand_bottom_left;
int cand_left;
@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
uint8_t flags;
} HEVCFrame;
+#ifdef RPI_WORKER
+typedef struct HEVCLocalContextIntra {
+ TransformUnit tu;
+ NeighbourAvailable na;
+} HEVCLocalContextIntra;
+#endif
+
typedef struct HEVCLocalContext {
+ TransformUnit tu;
+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
+
uint8_t cabac_state[HEVC_CONTEXTS];
uint8_t stat_coeff[4];
@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
int qPy_pred;
- TransformUnit tu;
uint8_t ctb_left_flag;
uint8_t ctb_up_flag;
@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
int ct_depth;
CodingUnit cu;
PredictionUnit pu;
- NeighbourAvailable na;
#define BOUNDARY_LEFT_SLICE (1 << 0)
#define BOUNDARY_LEFT_TILE (1 << 1)
@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
int boundary_flags;
} HEVCLocalContext;
+
+#ifdef RPI
+
+// The processing is done in chunks
+// Each chunk corresponds to 24 64x64 luma blocks (24 because it is divisible by both 8, the number of chroma QPUs, and 12, the number of luma QPUs)
+// This is a distance of 1536 pixels across the screen
+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+// but allocate more memory and increase the latency before data in the next frame can be processed
+#define RPI_NUM_CHUNKS 1
+
+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+
+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+#define RPI_MAX_MV_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Each block can have an intra prediction and a transform_add command
+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Worst case is 16x16 CTUs
+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
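+
+// For reference, with RPI_NUM_CHUNKS == 1 as defined above the figures work
+// out as:
+//   RPI_MAX_WIDTH        = 1*64*24         = 1536 pixels
+//   RPI_MAX_MV_CMDS      = 2*16*3*(1536/4) = 36864 commands
+//   RPI_MAX_PRED_CMDS    = 2*16*3*(1536/4) = 36864 commands
+//   RPI_MAX_DEBLOCK_CMDS = 1536*4/16       = 384 commands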
+
+#define RPI_CMD_LUMA_UNI 0
+#define RPI_CMD_CHROMA_UNI 1
+#define RPI_CMD_LUMA_BI 2
+#define RPI_CMD_CHROMA_BI 3
+#define RPI_CMD_V_BI 4
+
+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+// #define RPI_PRECLEAR
+
+// Command for inter prediction
+typedef struct HEVCMvCmd {
+ int cmd;
+ uint8_t *dst;
+ ptrdiff_t dststride;
+ uint8_t *src;
+ ptrdiff_t srcstride;
+ Mv mv;
+ int x_off;
+ int y_off;
+ int block_w;
+ int block_h;
+ int weight;
+ int offset;
+ uint8_t *src1;
+ ptrdiff_t srcstride1;
+ Mv mv1;
+ int8_t ref_idx[2];
+} HEVCMvCmd;
+
+
+// Command for intra prediction and transform_add of predictions to coefficients
+#define RPI_PRED_TRANSFORM_ADD 0
+#define RPI_PRED_INTRA 1
+typedef struct HEVCPredCmd {
+ uint8_t size;
+ uint8_t type;
+ uint8_t na;
+ uint8_t c_idx;
+ union {
+ uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+ uint32_t x; // RPI_PRED_INTRA
+ };
+ union {
+ int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+ uint32_t y; // RPI_PRED_INTRA
+ };
+ union {
+        uint32_t stride;         // RPI_PRED_TRANSFORM_ADD
+        enum IntraPredMode mode; // RPI_PRED_INTRA
+ };
+} HEVCPredCmd;
+
+#endif
+
typedef struct HEVCContext {
const AVClass *c; // needed by private avoptions
AVCodecContext *avctx;
@@ -798,13 +895,107 @@ typedef struct HEVCContext {
HEVCLocalContext *HEVClcList[MAX_NB_THREADS];
HEVCLocalContext *HEVClc;
-
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra HEVClcIntra;
+#endif
uint8_t threads_type;
uint8_t threads_number;
int width;
int height;
+ int used_for_ref;
+
+#ifdef RPI
+ int enable_rpi;
+ HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+ int buf_width;
+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+ int num_coeffs[RPI_MAX_JOBS][4];
+ int num_xfm_cmds[RPI_MAX_JOBS];
+ int num_mv_cmds[RPI_MAX_JOBS];
+ int num_pred_cmds[RPI_MAX_JOBS];
+ int num_dblk_cmds[RPI_MAX_JOBS];
+ int vpu_id;
+ int pass0_job; // Pass0 does coefficient decode
+ int pass1_job; // Pass1 does pixel processing
+ int ctu_count; // Number of CTUs done in pass0 so far
+ int max_ctu_count; // Number of CTUs when we trigger a round of processing
+ int ctu_per_y_chan; // Number of CTUs per luma QPU
+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+#ifdef RPI_INTER_QPU
+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+
+ // _base pointers are to the start of the row
+ uint32_t *mvs_base[RPI_MAX_JOBS][8];
+ // these pointers are to the next free space
+ uint32_t *u_mvs[RPI_MAX_JOBS][8];
+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+ // Function pointers
+ uint32_t mc_filter_uv;
+ uint32_t mc_filter_uv_b0;
+ uint32_t mc_filter_uv_b;
+#endif
+#ifdef RPI_LUMA_QPU
+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+ uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+ uint32_t *y_mvs[RPI_MAX_JOBS][12];
+ uint32_t *curr_y_mvs; // Current uniform stream for luma
+ // Function pointers
+ uint32_t mc_filter;
+ uint32_t mc_filter_b;
+#endif
+
+#ifdef RPI_WORKER
+ pthread_t worker_thread;
+ pthread_cond_t worker_cond_head;
+ pthread_cond_t worker_cond_tail;
+ pthread_mutex_t worker_mutex;
+
+ int worker_tail; // Contains the number of posted jobs
+ int worker_head; // Contains the number of completed jobs
+ int kill_worker; // set to 1 to terminate the worker
+#endif
+
+#define RPI_DEBLOCK_VPU_Q_COUNT 2
+
+#ifdef RPI_DEBLOCK_VPU
+ int enable_rpi_deblock;
+
+ int uv_setup_width;
+ int uv_setup_height;
+ int setup_width; // Number of 16x16 blocks across the image
+ int setup_height; // Number of 16x16 blocks down the image
+
+ struct dblk_vpu_q_s
+ {
+ GPU_MEM_PTR_T deblock_vpu_gmem;
+
+ uint8_t (*y_setup_arm)[2][2][2][4];
+ uint8_t (*y_setup_vc)[2][2][2][4];
+
+    uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this ([][][1][]) is unused, but keeping it makes life easier for the VPU: it lets us store with zeros and keeps the addresses aligned
+ uint8_t (*uv_setup_vc)[2][2][2][4];
+
+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+ int vpu_cmds_vc;
+
+ int cmd_id;
+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+
+ struct dblk_vpu_q_s * dvq;
+ unsigned int dvq_n;
+
+#endif
+
+#endif
+
uint8_t *cabac_state;
/** 1 if the independent slice segment header was successfully parsed */
@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
uint32_t max_mastering_luminance;
uint32_t min_mastering_luminance;
+#ifdef RPI
+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+#endif
} HEVCContext;
int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int log2_trafo_size, enum ScanType scan_idx,
int c_idx);
+#ifdef RPI_INTER_QPU
+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+#endif
+
void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 05b2821..e2f1f4e 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -21,14 +21,72 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define UNCHECKED_BITSTREAM_READER 1
+
#include "libavutil/attributes.h"
#include "libavutil/common.h"
-#include "cabac_functions.h"
#include "hevc.h"
+#include "cabac_functions.h"
+
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV ARCH_X86
+
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
+
+#if ARCH_ARM
+#include "arm/hevc_cabac.h"
+#endif
#define CABAC_MAX_BIN 31
+
+#if USE_BY22 && !USE_BY22_DIV
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
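+// I(x) is roughly 2^40/x, so in get_cabac_by22_peek() below
+// (low * I(range)) >> 32 is approximately (low / range) << 8, which after the
+// final "<< 1" matches the (low / range) << 9 of the real-divide path.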
+
+static const uint32_t cabac_by22_inv_range[256] = {
+ 0, I(257), I(258), I(259),
+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+ I(510), I(511)
+};
+#undef I
+#endif // USE_BY22
+
/**
* number of bin by SyntaxElement.
*/
@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
{ 28, 36, 43, 49, 54, 58, 61, 63, },
};
+
+typedef struct
+{
+ uint16_t coeff;
+ uint16_t scale;
+} xy_off_t;
+
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
+
+#define OFF_DIAG(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+#define OFF_HORIZ(t) {\
+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+}
+
+#define OFF_VERT(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+static const xy_off_t off_xys[3][4][16] =
+{
+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+};
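+
+// Each xy_off_t above pairs two offsets for a position within a 4x4
+// coefficient group: .coeff is the offset of that position from the group's
+// top-left corner within the transform block (row stride = trafo_size) and
+// .scale is the matching offset into the (at most 8x8) scale matrix.  For
+// 16x16 and 32x32 transforms SCALE_SHR() subsamples the coordinates by 2 and
+// 4 respectively; for 4x4 and 8x8 blocks the two offsets are identical.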
+
+
+// Helper fns
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+#define hevc_clz32 hevc_clz32_builtin
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+{
+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+}
+#endif
+
+// It is unlikely that we will ever need this but include for completeness
+#ifndef hevc_clz32
+static inline unsigned int hevc_clz32(unsigned int x)
+{
+ unsigned int n = 1;
+ if ((x & 0xffff0000) == 0) {
+ n += 16;
+ x <<= 16;
+ }
+ if ((x & 0xff000000) == 0) {
+ n += 8;
+ x <<= 8;
+ }
+ if ((x & 0xf0000000) == 0) {
+ n += 4;
+ x <<= 4;
+ }
+ if ((x & 0xc0000000) == 0) {
+ n += 2;
+ x <<= 2;
+ }
+ return n - ((x >> 31) & 1);
+}
+#endif
+
+
+#if !USE_BY22
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+// will no longer be called but the setup calls will still exist and we want
+// to null them out
+#define bypass_start(s)
+#define bypass_finish(s)
+#else
+// Use BY22 for residual bypass block
+
+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+
+// BY22 exploits the fact that bypass decoding is simply a divide into the
+// bitstream, so we can peek out large quantities of bits at once and treat
+// the result as if it were VLC.  In many cases this leads to O(1) processing
+// rather than O(n), though the setup and teardown are sufficiently expensive
+// that it is only worth using if we expect to be dealing with more than a
+// few bits.  The definition of "a few bits" will vary from platform to
+// platform, but tests on ARM show that it probably isn't worth it for a
+// single coded residual, but is for >1 - it also seems likely that if there
+// are more residuals then they are likely to be bigger and this will make
+// the O(1) nature of the code more worthwhile.
+
+
+#if !USE_BY22_DIV
+// * 1/x @ 32 bits gets us 22 bits of accuracy
+#define CABAC_BY22_PEEK_BITS 22
+#else
+// A real 32-bit divide gets us another bit
+// If we have a 64 bit int & a unit time divider then we should get a lot
+// of bits (55) but that is untested and it is unclear if it would give
+// us a large advantage
+#define CABAC_BY22_PEEK_BITS 23
+#endif
+
+// Bypass block start
+// Must be called before _by22_peek is used as it sets the CABAC environment
+// into the correct state. _by22_finish must be called to return to 'normal'
+// (i.e. non-bypass) cabac decoding
+static inline void get_cabac_by22_start(CABACContext * const c)
+{
+ const unsigned int bits = __builtin_ctz(c->low);
+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+#if !USE_BY22_DIV
+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+#endif
+
+ c->bytestream -= (CABAC_BITS / 8);
+ c->by22.bits = bits;
+#if !USE_BY22_DIV
+ c->by22.range = c->range;
+ c->range = inv;
+#endif
+ c->low = x;
+}
+
+// Bypass block finish
+// Must be called at the end of the bypass block to return to normal operation
+static inline void get_cabac_by22_finish(CABACContext * const c)
+{
+ unsigned int used = c->by22.bits;
+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+
+ c->bytestream += bytes_used + (CABAC_BITS / 8);
+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+#if !USE_BY22_DIV
+ c->range = c->by22.range;
+#endif
+}
+
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is called and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22 which should be long enough for any prefix or suffix
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+{
+#if USE_BY22_DIV
+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+#else
+ uint32_t x = c->low & ~1U;
+ const uint32_t inv = c->range;
+
+ if (inv != 0)
+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+ return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
+ // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+ // and refill lower bits
+ // We will probably OR over some existing bits but that doesn't matter
+ c->by22.bits += n;
+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif // USE_BY22
+
+
void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
{
if (s->ps.pps->entropy_coding_sync_enabled_flag &&
@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
}
-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
}
-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
{
- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
}
int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
}
-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
int log2_size, int *last_scx_prefix, int *last_scy_prefix)
{
int i = 0;
int max = (log2_size << 1) - 1;
int ctx_offset, ctx_shift;
- if (!c_idx) {
+ if (!c_idx_nz) {
ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
ctx_shift = (log2_size + 1) >> 2;
} else {
@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
return value;
}
-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
{
int inc;
- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+ inc = (ctx_cg != 0) + (c_idx_nz << 1);
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
}
-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
- int offset, const uint8_t *ctx_idx_map)
-{
- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-}
-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
{
return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
}
@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
}
-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ unsigned int prefix;
+ unsigned int last_coeff_abs_level_remaining;
+ unsigned int n;
+
+ y = get_cabac_by22_peek(c);
+ prefix = hevc_clz32(~y);
+ // y << prefix will always have top bit 0
+
+ if (prefix < 3) {
+ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+ n = prefix + 1 + rice_param;
+ }
+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+ {
+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix * 2 + rice_param - 2;
+ }
+ else {
+ unsigned int suffix;
+
+ get_cabac_by22_flush(c, prefix, y);
+ y = get_cabac_by22_peek(c);
+
+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix + rice_param - 2;
+ }
+
+ get_cabac_by22_flush(c, n, y);
+
+ return last_coeff_abs_level_remaining;
+}
+#endif
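+
+// Worked example of the short-prefix path above: with rice_param == 1 and the
+// top of the peeked word reading 1,0,1,... (prefix = 1, the terminating zero,
+// then one suffix bit = 1) the decoded value is
+// (prefix << rice_param) + suffix = 3 and prefix + 1 + rice_param = 3 bits
+// are flushed.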
+
+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+{
+ CABACContext * const c = &s->HEVClc->cc;
int prefix = 0;
int suffix = 0;
int last_coeff_abs_level_remaining;
int i;
- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
prefix++;
if (prefix == CABAC_MAX_BIN) {
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
return 0;
}
+
if (prefix < 3) {
for (i = 0; i < rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
} else {
int prefix_minus3 = prefix - 3;
for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+ suffix = (suffix << 1) | get_cabac_bypass(c);
last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
<< rc_rice_param) + suffix;
}
+
return last_coeff_abs_level_remaining;
}
-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
{
- int i;
- int ret = 0;
+ CABACContext * const c = &s->HEVClc->cc;
+ unsigned int i;
+ uint32_t ret = 0;
for (i = 0; i < nb; i++)
- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
- return ret;
+ ret = (ret << 1) | get_cabac_bypass(c);
+
+ return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint32_t y;
+ y = get_cabac_by22_peek(c);
+ get_cabac_by22_flush(c, nb, y);
+ return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i;
+ unsigned int rv = 0;
+ for (i = 0; i != n; ++i) {
+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+ const unsigned int b = get_cabac(c, state0 + idx);
+ rv = (rv << 1) | b;
+ }
+ return rv;
+}
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
+// this version of events.
+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+ int * const pprev_subset_coded, int * const psum,
+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+ CABACContext * const c = &s->HEVClc->cc;
+ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+ unsigned int rv;
+ unsigned int i;
+ const unsigned int n = FFMIN(n_end, 8);
+
+ // Really this is i != n but the simple unconditional loop is cheaper
+ // and faster
+ for (i = 0; i != 8; ++i)
+ levels[i] = 1;
+
+ rv = get_cabac_greater1_bits(c, n, state0);
+
+ *pprev_subset_coded = 0;
+ *psum = n;
+
+ rv <<= (32 - n);
+ if (rv != 0)
+ {
+ *pprev_subset_coded = 1;
+ *psum = n + 1;
+ i = hevc_clz32(rv);
+ levels[i] = 2;
+ if (get_cabac(c, state_gt2) == 0)
+ {
+ // Unset first coded bit
+ rv &= ~(0x80000000U >> i);
+ }
+ }
+
+ if (n_end > 8) {
+ const unsigned int g8 = n_end - 8;
+ rv |= ((1 << g8) - 1) << (24 - g8);
+ for (i = 0; i != g8; ++i) {
+ levels[i + 8] = 0;
+ }
+ }
+
+ return rv;
+}
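+
+// Illustrative example of the return value: with n_end == 4 and greater-1
+// flags decoding as 0,1,0,0, rv becomes 0x40000000, levels[1] is set to 2,
+// *pprev_subset_coded == 1 and *psum == 5; if the following greater-2 flag
+// then decodes as 0 the set bit is cleared again and 0 is returned.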
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+// or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift to a min of 0, so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+// to achieve it
+
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
}
+#endif
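+
+// For the common 8-bit case the numbers work out as follows: an 8x8 transform
+// gives shift = bit_depth + log2_trafo_size - 6 = 8 + 3 - 6 = 5 in the
+// dequant setup below; if div6[qp] >= 5 the whole shift is folded into scale
+// and shift becomes 0, otherwise shift is just reduced by div6[qp].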
+
+
+#ifndef update_rice
+static inline void update_rice(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+ if (x >= 6)
+ (*stat_coeff)++;
+ else if (x == 0 && *stat_coeff > 0)
+ (*stat_coeff)--;
+}
+#endif
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+                                                      const uint8_t * const ctx_map,
+ uint8_t * p)
+{
+ do {
+ if (get_cabac(c, state0 + ctx_map[n]))
+ *p++ = n;
+ } while (--n != 0);
+ return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+                                   const uint8_t * const ctx_map,
+ uint8_t * const flag_idx)
+{
+ int rv;
+
+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+ return rv;
+}
+
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x1, x2, x3,\
+ x4, x5, x6, x7,\
+ x8, x9, x10, x11,\
+ x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x8, x12,\
+ x1, x5, x9, x13,\
+ x2, x6, x10, x14,\
+ x3, x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x1, x8,\
+ x5, x2, x12, x9,\
+ x6, x3, x13, x10,\
+ x7, x14, x11, x15}
+
+
+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+ uint8_t * const significant_coeff_group_flag,
+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+ int * const pPrev_sig)
+{
+ while (--i >= 0) {
+ unsigned int x_cg = scan_x_cg[i];
+ unsigned int y_cg = scan_y_cg[i];
+
+ // For the flag decode we only care about Z/NZ but
+ // we use the full Right + Down * 2 when calculating
+ // significant coeff flags so we obtain it here
+ //.
+ // The group flag array is one longer than it needs to
+ // be so we don't need to check for y_cg limits
+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
+
+ if (i == 0 ||
+ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+ {
+ significant_coeff_group_flag[y_cg] |= (1 << x_cg);
+ *pPrev_sig = prev_sig;
+ break;
+ }
+ }
+
+ return i;
+}
+
void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int log2_trafo_size, enum ScanType scan_idx,
int c_idx)
{
-#define GET_COORD(offset, n) \
- do { \
- x_c = (x_cg << 2) + scan_x_off[n]; \
- y_c = (y_cg << 2) + scan_y_off[n]; \
- } while (0)
- HEVCLocalContext *lc = s->HEVClc;
- int transform_skip_flag = 0;
+ HEVCLocalContext * const lc = s->HEVClc;
+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
int last_significant_coeff_x, last_significant_coeff_y;
- int last_scan_pos;
- int n_end;
int num_coeff = 0;
- int greater1_ctx = 1;
+ int prev_subset_coded = 0;
int num_last_subset;
int x_cg_last_sig, y_cg_last_sig;
- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+ const uint8_t *scan_x_cg, *scan_y_cg;
+ const xy_off_t * scan_xy_off;
ptrdiff_t stride = s->frame->linesize[c_idx];
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
((x0 >> hshift) << s->ps.sps->pixel_shift)];
+#ifdef RPI
+ //***** transform_skip_flag decoded later!
+ int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+#endif
int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
- uint8_t significant_coeff_group_flag[8][8] = {{0}};
+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
int explicit_rdpcm_flag = 0;
int explicit_rdpcm_dir_flag;
int trafo_size = 1 << log2_trafo_size;
int i;
- int qp,shift,add,scale,scale_m;
+ int qp,shift,scale;
static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
const uint8_t *scale_matrix = NULL;
uint8_t dc_scale;
int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
lc->tu.intra_pred_mode_c;
+ int prev_sig = 0;
+ const int c_idx_nz = (c_idx != 0);
+
+ int may_hide_sign;
+
+#ifdef RPI
+ if (s->enable_rpi) {
+ int n = trafo_size * trafo_size;
+ if (use_vpu) {
+            // We support log2_trafo_size 4 and 5.
+            // Size 4 grows from the front (coeffs_buf_arm[2] points to the start of the buffer)
+            // Size 5 grows from the back (coeffs_buf_arm[3] points to the end of the buffer)
+ // num_coeffs is indexed by log2_trafo_size-2
+ if (log2_trafo_size == 4)
+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+ else
+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+ s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+ } else {
+ coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+ s->num_coeffs[s->pass0_job][0] += n;
+ }
+ }
+ // We now do the memset after transform_add while we know the data is cached.
+ #ifdef RPI_PRECLEAR
+ #else
+ memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+ #endif
+#else
memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+#endif
+
+
// Derive QP for dequant
if (!lc->cu.cu_transquant_bypass_flag) {
- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
static const uint8_t rem6[51 + 4 * 6 + 1] = {
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
};
int qp_y = lc->qp_y;
+ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
if (s->ps.pps->transform_skip_enabled_flag &&
log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+ if (transform_skip_flag) {
+ trans_skip_or_bypass = 1;
+ if (lc->cu.pred_mode == MODE_INTRA &&
+ s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+ may_hide_sign = 0;
+ }
+ }
}
if (c_idx == 0) {
@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
qp += s->ps.sps->qp_bd_offset;
}
- shift = s->ps.sps->bit_depth + log2_trafo_size - 5;
- add = 1 << (shift-1);
- scale = level_scale[rem6[qp]] << (div6[qp]);
- scale_m = 16; // default when no custom scaling lists.
- dc_scale = 16;
+ // Shift is set to one less than will actually occur as the scale
+ // and saturate step adds 1 and then shifts right again
+ shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+ scale = level_scale[rem6[qp]];
+ if (div6[qp] >= shift) {
+ scale <<= (div6[qp] - shift);
+ shift = 0;
+ } else {
+ shift -= div6[qp];
+ }
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
int matrix_id = lc->cu.pred_mode != MODE_INTRA;
matrix_id = 3 * matrix_id + c_idx;
scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+ dc_scale = scale_matrix[0];
if (log2_trafo_size >= 4)
dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
}
+ else
+ {
+ static const uint8_t sixteen_scale[64] = {
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ };
+ scale_matrix = sixteen_scale;
+ dc_scale = 16;
+ }
} else {
+ static const uint8_t unit_scale[64] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ };
+ scale_matrix = unit_scale;
shift = 0;
- add = 0;
- scale = 0;
- dc_scale = 0;
+ scale = 2; // We will shift right to kill this
+ dc_scale = 1;
+
+ may_hide_sign = 0;
}
if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+ trans_skip_or_bypass) {
+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
if (explicit_rdpcm_flag) {
- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+ may_hide_sign = 0;
+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
}
}
- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
&last_significant_coeff_x, &last_significant_coeff_y);
if (last_significant_coeff_x > 3) {
@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
int last_x_c = last_significant_coeff_x & 3;
int last_y_c = last_significant_coeff_y & 3;
- scan_x_off = ff_hevc_diag_scan4x4_x;
- scan_y_off = ff_hevc_diag_scan4x4_y;
num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
- if (trafo_size == 4) {
+
+ switch (log2_trafo_size) {
+ case 2:
scan_x_cg = scan_1x1;
scan_y_cg = scan_1x1;
- } else if (trafo_size == 8) {
+ break;
+ case 3:
num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = diag_scan2x2_x;
scan_y_cg = diag_scan2x2_y;
- } else if (trafo_size == 16) {
+ break;
+ case 4:
num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan4x4_x;
scan_y_cg = ff_hevc_diag_scan4x4_y;
- } else { // trafo_size == 32
+ break;
+ case 5:
+ default:
num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
scan_x_cg = ff_hevc_diag_scan8x8_x;
scan_y_cg = ff_hevc_diag_scan8x8_y;
+ break;
}
break;
}
case SCAN_HORIZ:
scan_x_cg = horiz_scan2x2_x;
scan_y_cg = horiz_scan2x2_y;
- scan_x_off = horiz_scan4x4_x;
- scan_y_off = horiz_scan4x4_y;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
break;
default: //SCAN_VERT
scan_x_cg = horiz_scan2x2_y;
scan_y_cg = horiz_scan2x2_x;
- scan_x_off = horiz_scan4x4_y;
- scan_y_off = horiz_scan4x4_x;
num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
break;
}
num_coeff++;
num_last_subset = (num_coeff - 1) >> 4;
- for (i = num_last_subset; i >= 0; i--) {
- int n, m;
- int x_cg, y_cg, x_c, y_c, pos;
- int implicit_non_zero_coeff = 0;
- int64_t trans_coeff_level;
- int prev_sig = 0;
- int offset = i << 4;
- int rice_init = 0;
+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
- uint8_t significant_coeff_flag_idx[16];
- uint8_t nb_significant_coeff_flag = 0;
-
- x_cg = scan_x_cg[i];
- y_cg = scan_y_cg[i];
-
- if ((i < num_last_subset) && (i > 0)) {
- int ctx_cg = 0;
- if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
- significant_coeff_group_flag[x_cg][y_cg] =
- significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
- implicit_non_zero_coeff = 1;
- } else {
- significant_coeff_group_flag[x_cg][y_cg] =
- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
- (x_cg == 0 && y_cg == 0));
- }
+ i = num_last_subset;
+ do {
+ int implicit_non_zero_coeff = 0;
+ int n_end;
- last_scan_pos = num_coeff - offset - 1;
+ uint8_t significant_coeff_flag_idx[16];
+ unsigned int nb_significant_coeff_flag = 0;
if (i == num_last_subset) {
+ // First time through
+ int last_scan_pos = num_coeff - (i << 4) - 1;
n_end = last_scan_pos - 1;
significant_coeff_flag_idx[0] = last_scan_pos;
nb_significant_coeff_flag = 1;
} else {
n_end = 15;
+ implicit_non_zero_coeff = (i != 0);
}
- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-
- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
- static const uint8_t ctx_idx_map[] = {
- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+ if (n_end >= 0) {
+ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+ };
+ static const uint8_t ctx_idx_maps[3][4][16] = {
+ {
+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ }
};
const uint8_t *ctx_idx_map_p;
int scf_offset = 0;
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
- if (c_idx == 0) {
- scf_offset = 40;
- } else {
- scf_offset = 14 + 27;
- }
+
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ ctx_idx_map_p = ctx_idx_maps[0][3];
+ scf_offset = 40 + c_idx_nz;
} else {
- if (c_idx != 0)
+ if (c_idx_nz != 0)
scf_offset = 27;
+
if (log2_trafo_size == 2) {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
} else {
- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
- if (c_idx == 0) {
- if ((x_cg > 0 || y_cg > 0))
+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+ if (!c_idx_nz) {
+ if (i != 0)
scf_offset += 3;
+
if (log2_trafo_size == 3) {
scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
} else {
@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
}
- for (n = n_end; n > 0; n--) {
- x_c = scan_x_off[n];
- y_c = scan_y_off[n];
- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
- significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
- nb_significant_coeff_flag++;
+
+ if (n_end > 0) {
+ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+ n_end, ctx_idx_map_p,
+ significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+ nb_significant_coeff_flag += cnt;
+ if (cnt != 0) {
implicit_non_zero_coeff = 0;
}
}
+
if (implicit_non_zero_coeff == 0) {
- if (s->ps.sps->transform_skip_context_enabled_flag &&
- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
- if (c_idx == 0) {
- scf_offset = 42;
- } else {
- scf_offset = 16 + 27;
- }
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ scf_offset = 42 + c_idx_nz;
} else {
if (i == 0) {
- if (c_idx == 0)
- scf_offset = 0;
- else
- scf_offset = 27;
+ scf_offset = c_idx_nz ? 27 : 0;
} else {
scf_offset = 2 + scf_offset;
}
}
- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
nb_significant_coeff_flag++;
}
@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
}
}
- n_end = nb_significant_coeff_flag;
-
+ if (nb_significant_coeff_flag != 0) {
+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ ((i != 0 && !c_idx_nz) ? 2 : 0) |
+ prev_subset_coded;
+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+ (gt1_idx_delta << 2);
+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+ gt1_idx_delta;
+
+ const unsigned int x_cg = scan_x_cg[i];
+ const unsigned int y_cg = scan_y_cg[i];
+ int16_t * const blk_coeffs = coeffs +
+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+ // This calculation is 'wrong' for log2_trafo_size == 2
+ // but that doesn't matter as in this case x_cg & y_cg
+ // are always 0 so the result is correct (0) anyway
+ const uint8_t * const blk_scale = scale_matrix +
+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+ // * The following code block doesn't deal with these flags:
+ // (nor did the one it replaces)
+ //
+ // cabac_bypass_alignment_enabled_flag
+ // This should be easy but I can't find a test case
+ // extended_precision_processing_flag
+ // This can extend the required precision past 16bits
+ // so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+ if (nb_significant_coeff_flag == 1) {
+ // There is a small gain to be had from special casing the single
+ // transform coefficient case. The reduction in complexity
+ // makes up for the code duplication.
+
+ int trans_coeff_level = 1;
+ int coeff_sign_flag;
+ int coded_val = 0;
+
+ // initialize first elem of coeff_abs_level_greater1_flag
+ prev_subset_coded = 0;
+
+ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+ trans_coeff_level = 2;
+ prev_subset_coded = 1;
+ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+ }
- if (n_end) {
- int first_nz_pos_in_cg;
- int last_nz_pos_in_cg;
- int c_rice_param = 0;
- int first_greater1_coeff_idx = -1;
- uint8_t coeff_abs_level_greater1_flag[8];
- uint16_t coeff_sign_flag;
- int sum_abs = 0;
- int sign_hidden;
- int sb_type;
+ // Probably not worth the overhead of starting the by22 bypass block for just one value
+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+ if (coded_val)
+ {
+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+ } else {
+ uint8_t * const stat_coeff =
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ const unsigned int c_rice_param = *stat_coeff >> 2;
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
- // initialize first elem of coeff_bas_level_greater1_flag
- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ }
+ }
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
- sb_type = 2 * (c_idx == 0 ? 1 : 0);
- else
- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
- c_rice_param = lc->stat_coeff[sb_type] / 4;
- }
+ {
+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+ const unsigned int scale_m = blk_scale[xy_off->scale];
- if (!(i == num_last_subset) && greater1_ctx == 0)
- ctx_set++;
- greater1_ctx = 1;
- last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
- int inc = (ctx_set << 2) + greater1_ctx;
- coeff_abs_level_greater1_flag[m] =
- coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
- if (coeff_abs_level_greater1_flag[m]) {
- greater1_ctx = 0;
- if (first_greater1_coeff_idx == -1)
- first_greater1_coeff_idx = m;
- } else if (greater1_ctx > 0 && greater1_ctx < 3) {
- greater1_ctx++;
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (trans_coeff_level ^ k) - k, // Apply sign
+ scale,
+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+ shift);
}
}
- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-
- if (lc->cu.cu_transquant_bypass_flag ||
- (lc->cu.pred_mode == MODE_INTRA &&
- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag &&
- (pred_mode_intra == 10 || pred_mode_intra == 26 )) ||
- explicit_rdpcm_flag)
- sign_hidden = 0;
else
- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+#endif
+ {
+ int sign_hidden = may_hide_sign;
+ int levels[16]; // Should be able to get away with int16_t but that fails some tests
+ uint32_t coeff_sign_flags;
+ uint32_t coded_vals = 0;
+ // Sum(abs(level[]))
+ // In fact we only need the bottom bit and in some future
+ // version that may be all we calculate
+ unsigned int sum_abs;
+
+ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+
+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+ sign_hidden = 0;
+
+ // -- Start bypass block
+
+ bypass_start(s);
+
+ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+
+ if (coded_vals != 0)
+ {
+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+ int * level = levels - 1;
+
+ do {
+ {
+ const unsigned int z = hevc_clz32(coded_vals) + 1;
+ level += z;
+ coded_vals <<= z;
+ }
- if (first_greater1_coeff_idx != -1) {
- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
- }
- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
- } else {
- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
- }
+ {
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
+
+ sum_abs += last_coeff_abs_level_remaining + 1;
+ *level = trans_coeff_level;
+
+ if (stat_coeff != NULL)
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ stat_coeff = NULL;
- for (m = 0; m < n_end; m++) {
- n = significant_coeff_flag_idx[m];
- GET_COORD(offset, n);
- if (m < 8) {
- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level += last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
+ if (trans_coeff_level > (3 << c_rice_param) &&
+ (c_rice_param < 4 || rice_adaptation_enabled))
+ ++c_rice_param;
}
- }
- } else {
- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
- trans_coeff_level = 1 + last_coeff_abs_level_remaining;
- if (trans_coeff_level > (3 << c_rice_param))
- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
- int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
- lc->stat_coeff[sb_type]++;
- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
- if (lc->stat_coeff[sb_type] > 0)
- lc->stat_coeff[sb_type]--;
- rice_init = 1;
- }
+ } while (coded_vals != 0);
}
- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
- sum_abs += trans_coeff_level;
- if (n == first_nz_pos_in_cg && (sum_abs&1))
- trans_coeff_level = -trans_coeff_level;
+
+ // sign_hidden = 0 or 1 so we can combine the tests
+ if ((sign_hidden & sum_abs) != 0) {
+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
}
- if (coeff_sign_flag >> 15)
- trans_coeff_level = -trans_coeff_level;
- coeff_sign_flag <<= 1;
- if(!lc->cu.cu_transquant_bypass_flag) {
- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
- if(y_c || x_c || log2_trafo_size < 4) {
- switch(log2_trafo_size) {
- case 3: pos = (y_c << 3) + x_c; break;
- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
- default: pos = (y_c << 2) + x_c; break;
- }
- scale_m = scale_matrix[pos];
- } else {
- scale_m = dc_scale;
- }
+
+ bypass_finish(s);
+
+ // -- Finish bypass block
+
+ // Scale loop
+ {
+ int m = nb_significant_coeff_flag - 1;
+
+ // Deal with DC component (if any) first
+ if (i == 0 && significant_coeff_flag_idx[m] == 0)
+ {
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+ blk_coeffs[0] = trans_scale_sat(
+ (levels[m] ^ k) - k, scale, dc_scale, shift);
+ --m;
}
- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
- if(trans_coeff_level < 0) {
- if((~trans_coeff_level) & 0xFffffffffff8000)
- trans_coeff_level = -32768;
- } else {
- if(trans_coeff_level & 0xffffffffffff8000)
- trans_coeff_level = 32767;
+
+#if !USE_N_END_1
+ // If USE_N_END_1 is set then m was at least 1 initially
+ if (m >= 0)
+#endif
+ {
+ do {
+ const xy_off_t * const xy_off = scan_xy_off +
+ significant_coeff_flag_idx[m];
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+
+ blk_coeffs[xy_off->coeff] = trans_scale_sat(
+ (levels[m] ^ k) - k,
+ scale,
+ blk_scale[xy_off->scale],
+ shift);
+ } while (--m >= 0);
}
}
- coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+
}
}
- }
+ } while ((i = next_subset(s, i, c_idx_nz,
+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
if (lc->cu.cu_transquant_bypass_flag) {
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else {
- if (transform_skip_flag) {
+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
log2_trafo_size == 2 &&
lc->cu.pred_mode == MODE_INTRA;
@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
for (i = 0; i < 8; i++)
FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
}
-
s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
- s->hevcdsp.idct_4x4_luma(coeffs);
+ s->hevcdsp.idct_4x4_luma(coeffs);
} else {
+#ifdef RPI
+ if (!use_vpu) {
+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+ if (max_xy == 0) {
+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+ } else {
+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+ if (max_xy < 4)
+ col_limit = FFMIN(4, col_limit);
+ else if (max_xy < 8)
+ col_limit = FFMIN(8, col_limit);
+ else if (max_xy < 12)
+ col_limit = FFMIN(24, col_limit);
+
+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+ }
+ }
+#else
int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
if (max_xy == 0)
s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
col_limit = FFMIN(24, col_limit);
s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
}
+#endif
}
}
if (lc->tu.cross_pf) {
@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
}
+#ifdef RPI
+ if (s->enable_rpi) {
+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ cmd->type = RPI_PRED_TRANSFORM_ADD;
+ cmd->size = log2_trafo_size;
+ cmd->buf = coeffs;
+ cmd->dst = dst;
+ cmd->stride = stride;
+ return;
+ }
+#endif
s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
}
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 1f33b0c..55a0315 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -22,6 +22,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_SAO
+//#define DISABLE_DEBLOCK
+//#define DISABLE_STRENGTHS
+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+//#define DISABLE_DEBLOCK_NONREF
+
#include "libavutil/common.h"
#include "libavutil/internal.h"
@@ -31,6 +37,11 @@
#include "bit_depth_template.c"
+#ifdef RPI
+#include "rpi_user_vcsm.h"
+#include "rpi_qpu.h"
+#endif
+
#define LUMA 0
#define CB 1
#define CR 2
@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
+#ifdef DISABLE_SAO
+ return;
+#endif
+
if (restore) {
if (!edges[0]) {
left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->ps.sps->pcm.loop_filter_disable_flag) ||
s->ps.pps->transquant_bypass_enable_flag;
+#ifdef DISABLE_DEBLOCK_NONREF
+ if (!s->used_for_ref)
+ return; // Don't deblock non-reference frames
+#endif
+#ifdef DISABLE_DEBLOCK
+ return;
+#endif
+ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+ return;
if (x0) {
left_tc_offset = s->deblock[ctb - 1].tc_offset;
left_beta_offset = s->deblock[ctb - 1].beta_offset;
@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((y>>3) & 1) << 1;
+ int b = (x>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[0][b][0][a] = beta;
+ setup[0][b][0][a + 1] = beta;
+ setup[0][b][1][a] = tc[0];
+ setup[0][b][1][a + 1] = tc[1];
+ } else
+#endif
s->hevcdsp.hevc_v_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int num16 = (y>>4)*s->setup_width + (x>>4);
+ int a = ((x>>3) & 1) << 1;
+ int b = (y>>3) & 1;
+ setup = s->dvq->y_setup_arm[num16];
+ setup[1][b][0][a] = beta;
+ setup[1][b][0][a + 1] = beta;
+ setup[1][b][1][a] = tc[0];
+ setup[1][b][1][a + 1] = tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((yc>>3) & 1) << 1;
+ int b = (xc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[0][b][0][a] = c_tc[0];
+ setup[0][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_v_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
+
}
}
@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
s->frame->linesize[chroma],
c_tc, no_p, no_q);
} else
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ uint8_t (*setup)[2][2][4];
+ int xc = x>>s->ps.sps->hshift[chroma];
+ int yc = y>>s->ps.sps->vshift[chroma];
+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+ int a = ((xc>>3) & 1) << 1;
+ int b = (yc>>3) & 1;
+ setup = s->dvq->uv_setup_arm[num16];
+ setup[1][b][0][a] = c_tc[0];
+ setup[1][b][0][a + 1] = c_tc[1];
+ } else
+#endif
s->hevcdsp.hevc_h_loop_filter_chroma(src,
s->frame->linesize[chroma],
c_tc, no_p, no_q);
@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
}
}
-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
- RefPicList *neigh_refPicList)
-{
- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
- // same L0 and L1
- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] &&
- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
- return 1;
- else
- return 0;
- } else {
- return 1;
- }
- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
- Mv A, B;
- int ref_A, ref_B;
-
- if (curr->pred_flag & 1) {
- A = curr->mv[0];
- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
- } else {
- A = curr->mv[1];
- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
- }
-
- if (neigh->pred_flag & 1) {
- B = neigh->mv[0];
- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
- } else {
- B = neigh->mv[1];
- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
- }
-
- if (ref_A == ref_B) {
- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
- return 1;
- else
- return 0;
- } else
- return 1;
- }
-
- return 1;
-}
void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_trafo_size)
@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
int min_pu_width = s->ps.sps->min_pu_width;
int min_tu_width = s->ps.sps->min_tb_width;
- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
int boundary_upper, boundary_left;
- int i, j, bs;
+ int i, j;
+ RefPicList *rpl = s->ref->refPicList;
+ int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2;
+ int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+ int y_pu = y0 >> log2_min_pu_size;
+ int x_pu = x0 >> log2_min_pu_size;
+ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ int is_intra = curr->pred_flag == PF_INTRA;
+ int inc = log2_min_pu_size == 2 ? 2 : 1;
+ uint8_t *bs;
+
+#ifdef DISABLE_STRENGTHS
+ return;
+#endif
boundary_upper = y0 > 0 && !(y0 & 7);
if (boundary_upper &&
@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_upper = 0;
+ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_upper) {
RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
- s->ref->refPicList;
- int yp_pu = (y0 - 1) >> log2_min_pu_size;
- int yq_pu = y0 >> log2_min_pu_size;
- int yp_tu = (y0 - 1) >> log2_min_tu_size;
- int yq_tu = y0 >> log2_min_tu_size;
+ rpl;
+ MvField *top = curr - min_pu_width;
+
+ if (is_intra) {
+ for (i = 0; i < (1 << log2_trafo_size); i += 4)
+ bs[i >> 2] = 2;
+
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+ curr, top, bs);
for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- int x_tu = (x0 + i) >> log2_min_tu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || top_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, top, rpl_top);
- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+ int i_pu = i >> log2_min_pu_size;
+ int i_tu = i >> log2_min_tu_size;
+
+ if (top[i_pu].pred_flag == PF_INTRA)
+ bs[i >> 2] = 2;
+ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+ bs[i >> 2] = 1;
}
+ }
+ }
+
+ if (!is_intra) {
+ for (j = inc; j < trafo_in_min_pus; j += inc) {
+ MvField *top;
+
+ curr += min_pu_width * inc;
+ top = curr - min_pu_width;
+ bs += s->bs_width * inc << log2_min_pu_size >> 2;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, top, bs);
+ }
}
- // bs for vertical TU boundaries
boundary_left = x0 > 0 && !(x0 & 7);
if (boundary_left &&
((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
(x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
boundary_left = 0;
+ curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+
if (boundary_left) {
RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
- s->ref->refPicList;
- int xp_pu = (x0 - 1) >> log2_min_pu_size;
- int xq_pu = x0 >> log2_min_pu_size;
- int xp_tu = (x0 - 1) >> log2_min_tu_size;
- int xq_tu = x0 >> log2_min_tu_size;
+ rpl;
+ MvField *left = curr - 1;
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int y_pu = (y0 + i) >> log2_min_pu_size;
- int y_tu = (y0 + i) >> log2_min_tu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-
- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
- bs = 2;
- else if (curr_cbf_luma || left_cbf_luma)
- bs = 1;
- else
- bs = boundary_strength(s, curr, left, rpl_left);
- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
- }
- }
+ if (is_intra) {
+ for (j = 0; j < (1 << log2_trafo_size); j += 4)
+ bs[j * s->bs_width >> 2] = 2;
- if (log2_trafo_size > log2_min_pu_size && !is_intra) {
- RefPicList *rpl = s->ref->refPicList;
-
- // bs for TU internal horizontal PU boundaries
- for (j = 8; j < (1 << log2_trafo_size); j += 8) {
- int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
- int yq_pu = (y0 + j) >> log2_min_pu_size;
-
- for (i = 0; i < (1 << log2_trafo_size); i += 4) {
- int x_pu = (x0 + i) >> log2_min_pu_size;
- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu];
- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-
- bs = boundary_strength(s, curr, top, rpl);
- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+ } else {
+ int y_tu = y0 >> log2_min_tu_size;
+ int x_tu = x0 >> log2_min_tu_size;
+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+ uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+ curr, left, bs);
+
+ for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+ int j_pu = j >> log2_min_pu_size;
+ int j_tu = j >> log2_min_tu_size;
+
+ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+ bs[j * s->bs_width >> 2] = 2;
+ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+ bs[j * s->bs_width >> 2] = 1;
}
}
+ }
- // bs for TU internal vertical PU boundaries
- for (j = 0; j < (1 << log2_trafo_size); j += 4) {
- int y_pu = (y0 + j) >> log2_min_pu_size;
+ if (!is_intra) {
+ for (i = inc; i < trafo_in_min_pus; i += inc) {
+ MvField *left;
- for (i = 8; i < (1 << log2_trafo_size); i += 8) {
- int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
- int xq_pu = (x0 + i) >> log2_min_pu_size;
- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+ curr += inc;
+ left = curr - 1;
+ bs += inc << log2_min_pu_size >> 2;
- bs = boundary_strength(s, curr, left, rpl);
- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
- }
+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+ curr, left, bs);
}
}
}
@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
#undef CB
#undef CR
+#if !defined(RPI_FAST_CACHEFLUSH)
+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+static void flush_buffer_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+ gpu_cache_flush(&p);
+}
+
+static void flush_buffer_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+ gpu_cache_flush(&p);
+}
+
+static void flush_buffer_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+ gpu_cache_flush(&p);
+}
+#endif
+#endif
+
+
+#ifdef RPI_DEBLOCK_VPU
+#error Not fixed yet
+
+// ff_hevc_flush_buffer_lines
+// flushes and invalidates all pixel rows in [start,end-1]
+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ int curr_y = start;
+ int n = end;
+ int curr_uv = curr_y >> s->ps.sps->vshift[1];
+ int n_uv = n >> s->ps.sps->vshift[1];
+ int sz,base;
+ GPU_MEM_PTR_T p;
+ if (curr_uv < 0) curr_uv = 0;
+ if (n_uv<=curr_uv) { return; }
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ if (flush_chroma) {
+ p = get_gpu_mem_ptr_u(s->frame);
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)p.arm + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)p.arm + base;
+ iocache.s[1].size = sz;
+ }
+ if (flush_luma) {
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)p.arm + base;
+ iocache.s[2].size = sz;
+ }
+ vcsm_clean_invalid( &iocache );
+#else
+ if (flush_chroma) {
+ flush_buffer_u(s->frame);
+ flush_buffer_v(s->frame);
+ }
+ if (flush_luma) {
+ flush_buffer_y(s->frame);
+ }
+#endif
+}
+#endif
+
+#ifdef RPI_INTER_QPU
+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+{
+ if (s->enable_rpi && s->used_for_ref) {
+ // TODO make this use ff_hevc_flush_buffer_lines
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ int curr_y = ((int *)f->progress->data)[0];
+ int curr_uv = curr_y >> s->ps.sps->vshift[1];
+ int n_uv = n >> s->ps.sps->vshift[1];
+ int sz,base;
+ GPU_MEM_PTR_T p;
+ if (curr_uv < 0) curr_uv = 0;
+ if (n_uv<=curr_uv) { return; }
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
+ base = s->frame->linesize[1] * curr_uv;
+ p = get_gpu_mem_ptr_u(s->frame);
+ iocache.s[0].handle = p.vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int)p.arm + base;
+ iocache.s[0].size = sz;
+ p = get_gpu_mem_ptr_v(s->frame);
+ iocache.s[1].handle = p.vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int)p.arm + base;
+ iocache.s[1].size = sz;
+
+#ifdef RPI_LUMA_QPU
+ p = get_gpu_mem_ptr_y(s->frame);
+ sz = s->frame->linesize[0] * (n-curr_y);
+ base = s->frame->linesize[0] * curr_y;
+ iocache.s[2].handle = p.vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int)p.arm + base;
+ iocache.s[2].size = sz;
+#endif
+ vcsm_clean_invalid( &iocache );
+#else
+ flush_buffer_u(s->frame);
+ flush_buffer_v(s->frame);
+#ifdef RPI_LUMA_QPU
+ flush_buffer_y(s->frame);
+#endif
+
+#endif
+ //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+ //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+ //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+ }
+}
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+#error XXX
+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+{
+ // Flush image, 4 lines above to bottom of ctb stripe
+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+ // TODO flush buffer of beta/tc setup when it becomes cached
+
+ // Prepare three commands at once to avoid calling overhead
+ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+ s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+ s->dvq->vpu_cmds_arm[0][5] = 2;
+
+ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[1][5] = 3;
+
+ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+ s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ s->dvq->vpu_cmds_arm[2][5] = 4;
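/*
 * [Editor's note - not part of the patch, inferred from the ARM-side code
 * above only] Each six-word command appears to carry:
 *   [0] VC (bus) address of the plane strip to be deblocked
 *   [1] line stride of that plane
 *   [2] width of the beta/tc setup array for that plane
 *   [3] VC address of the setup rows covering this strip
 *   [4] strip height in 16-pixel units
 *   [5] an operation selector (2 = luma, 3 = U, 4 = V)
 * The VPU firmware that consumes these words is not shown in this patch, so
 * the field meanings are an assumption.
 */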
+ // Call VPU
+ s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+
+ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+ s->dvq = s->dvq_ents + s->dvq_n;
+
+ if (s->dvq->cmd_id != -1) {
+ vpu_wait(s->dvq->cmd_id);
+ s->dvq->cmd_id = -1;
+ }
+}
+
+#endif
+
void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
{
int x_end = x >= s->ps.sps->width - ctb_size;
+#ifdef RPI_DEBLOCK_VPU
+ int done_deblock = 0;
+#endif
if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
deblocking_filter_CTB(s, x, y);
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock && x_end)
+ {
+ int y_at_end = y >= s->ps.sps->height - ctb_size;
+ int height = 64; // Deblock in units 64 high to avoid too many VPU calls
+ int y_start = y&~63;
+ if (y_at_end) height = s->ps.sps->height - y_start;
+ if ((((y+ctb_size)&63)==0) || y_at_end) {
+ done_deblock = 1;
+ rpi_deblock(s, y_start, height);
+ }
+ }
+#endif
if (s->ps.sps->sao_enabled) {
int y_end = y >= s->ps.sps->height - ctb_size;
if (y && x)
@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
sao_filter_CTB(s, x - ctb_size, y);
if (y && x_end) {
sao_filter_CTB(s, x, y - ctb_size);
- if (s->threads_type & FF_THREAD_FRAME )
+ if (s->threads_type & FF_THREAD_FRAME ) {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s,&s->ref->tf, y);
+#endif
ff_thread_report_progress(&s->ref->tf, y, 0);
+ }
}
if (x_end && y_end) {
sao_filter_CTB(s, x , y);
- if (s->threads_type & FF_THREAD_FRAME )
+ if (s->threads_type & FF_THREAD_FRAME ) {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+#endif
ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+ }
+ }
+ } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+ //int newh = y + ctb_size - 4;
+ //int currh = s->ref->tf.progress->data[0];
+ //if (((y + ctb_size)&63)==0)
+#ifdef RPI_DEBLOCK_VPU
+ if (s->enable_rpi_deblock) {
+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+ if (done_deblock) {
+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+ }
+ } else {
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
}
- } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+#else
+#ifdef RPI_INTER_QPU
+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+#endif
ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+#endif
+ }
}
void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 83f2ec2..6882a8d 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
sps->amp_enabled_flag = get_bits1(gb);
sps->sao_enabled = get_bits1(gb);
+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled);
+
sps->pcm_enabled_flag = get_bits1(gb);
if (sps->pcm_enabled_flag) {
sps->pcm.bit_depth = get_bits(gb, 4) + 1;
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..a6534a9 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
#include "hevcdsp_template.c"
#undef BIT_DEPTH
+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs)
+{
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+
+#if 1 // This more directly matches the original implementation
+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+ // same L0 and L1
+ if (curr_refL0 == neigh_refL0 &&
+ curr_refL0 == curr_refL1 &&
+ neigh_refL0 == neigh_refL1) {
+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL0 == curr_refL0 &&
+ neigh_refL1 == curr_refL1) {
+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL1 == curr_refL0 &&
+ neigh_refL0 == curr_refL1) {
+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else {
+ strength = 1;
+ }
+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+ Mv curr_mv0, neigh_mv0;
+
+ if (curr->pred_flag & 1) {
+ curr_mv0 = curr->mv[0];
+ } else {
+ curr_mv0 = curr->mv[1];
+ curr_refL0 = curr_refL1;
+ }
+
+ if (neigh->pred_flag & 1) {
+ neigh_mv0 = neigh->mv[0];
+ } else {
+ neigh_mv0 = neigh->mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ if (curr_refL0 == neigh_refL0) {
+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else
+ strength = 1;
+ } else
+ strength = 1;
+#else // This has exactly the same effect, but is more suitable for vectorisation
+ Mv curr_mv[2];
+ Mv neigh_mv[2];
+ memcpy(curr_mv, curr->mv, sizeof curr_mv);
+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+
+ if (!(curr->pred_flag & 2)) {
+ curr_mv[1] = curr_mv[0];
+ curr_refL1 = curr_refL0;
+ }
+ if (!(neigh->pred_flag & 2)) {
+ neigh_mv[1] = neigh_mv[0];
+ neigh_refL1 = neigh_refL0;
+ }
+ if (!(curr->pred_flag & 1)) {
+ curr_mv[0] = curr_mv[1];
+ curr_refL0 = curr_refL1;
+ }
+ if (!(neigh->pred_flag & 1)) {
+ neigh_mv[0] = neigh_mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ strength = 1;
+
+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+
+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+
+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+#endif
+
+ curr += in_inc / sizeof (MvField);
+ neigh += in_inc / sizeof (MvField);
+
+ for (out = dup; out > 0; out--)
+ {
+ *bs = strength;
+ bs += out_inc;
+ }
+ }
+}
+
void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
{
#undef FUNC
@@ -257,6 +371,8 @@ int i = 0;
break;
}
+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+
if (ARCH_X86)
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
if (ARCH_ARM)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..e221e54 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -42,6 +42,17 @@ typedef struct SAOParams {
uint8_t type_idx[3]; ///< sao_type_idx
} SAOParams;
+typedef struct Mv {
+ int16_t x; ///< horizontal component of motion vector
+ int16_t y; ///< vertical component of motion vector
+} Mv;
+
+typedef struct MvField {
+ DECLARE_ALIGNED(4, Mv, mv)[2];
+ int8_t ref_idx[2];
+ int8_t pred_flag;
+} MvField;
+
typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
struct GetBitContext *gb, int pcm_bit_depth);
@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ MvField *curr, MvField *neigh, uint8_t *bs);
} HEVCDSPContext;
void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 6ae87cc..28d2653 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -20,6 +20,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DISABLE_INTRA
+
#include "libavutil/pixdesc.h"
#include "bit_depth_template.c"
@@ -69,8 +71,11 @@ do { \
AV_WN4P(&ptr[i], a); \
else \
a = PIXEL_SPLAT_X4(ptr[i + 3])
-
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+#else
HEVCLocalContext *lc = s->HEVClc;
+#endif
int i;
int hshift = s->ps.sps->hshift[c_idx];
int vshift = s->ps.sps->vshift[c_idx];
@@ -114,6 +119,10 @@ do { \
int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
(x0 + size_in_luma_h)) >> hshift;
+#ifdef DISABLE_INTRA
+ return;
+#endif
+
if (s->ps.pps->constrained_intra_pred_flag == 1) {
int size_in_luma_pu_v = PU(size_in_luma_v);
int size_in_luma_pu_h = PU(size_in_luma_h);
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index 099a8c5..bdff2d2 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -24,6 +24,9 @@
* MMAL Video Decoder
*/
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
#include <bcm_host.h>
#include <interface/mmal/mmal.h>
#include <interface/mmal/mmal_parameters_video.h>
@@ -31,6 +34,7 @@
#include <interface/mmal/util/mmal_util_params.h>
#include <interface/mmal/util/mmal_default_components.h>
#include <interface/mmal/vc/mmal_vc_api.h>
+#pragma GCC diagnostic pop
#include "avcodec.h"
#include "internal.h"
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index 3adf28d..2f9195f 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
if (ctx->divx_version >= 0)
s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+
+ if (ctx->num_sprite_warping_points > 1)
+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
}
if (s->workaround_bugs & FF_BUG_STD_QPEL) {
@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
+ avctx->workaround_bugs = s->workaround_bugs;
if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
s->codec_id == AV_CODEC_ID_MPEG4 &&
avctx->idct_algo == FF_IDCT_AUTO) {
diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
new file mode 100644
index 0000000..4309f1c
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.h
@@ -0,0 +1,3070 @@
+unsigned char rpi_hevc_transform [] = {
+21,
+106,
+0,
+144,
+47,
+1,
+37,
+106,
+0,
+144,
+66,
+1,
+53,
+106,
+0,
+144,
+192,
+4,
+69,
+106,
+0,
+144,
+192,
+4,
+85,
+106,
+0,
+144,
+220,
+5,
+169,
+3,
+62,
+64,
+79,
+64,
+3,
+232,
+32,
+0,
+0,
+0,
+12,
+248,
+0,
+136,
+0,
+0,
+192,
+248,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+12,
+248,
+0,
+168,
+0,
+0,
+192,
+248,
+0,
+0,
+0,
+96,
+3,
+232,
+32,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+8,
+232,
+0,
+4,
+0,
+0,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+4,
+232,
+64,
+0,
+0,
+0,
+5,
+232,
+0,
+8,
+0,
+0,
+128,
+69,
+113,
+66,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+8,
+4,
+0,
+128,
+69,
+113,
+70,
+128,
+144,
+40,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+16,
+0,
+76,
+254,
+48,
+192,
+9,
+4,
+32,
+8,
+0,
+0,
+4,
+254,
+0,
+144,
+128,
+2,
+0,
+8,
+2,
+0,
+128,
+144,
+23,
+0,
+4,
+255,
+48,
+192,
+128,
+3,
+32,
+8,
+20,
+0,
+76,
+254,
+48,
+192,
+4,
+4,
+32,
+8,
+0,
+0,
+140,
+248,
+44,
+0,
+0,
+0,
+32,
+48,
+4,
+0,
+128,
+69,
+113,
+66,
+242,
+140,
+211,
+192,
+34,
+31,
+41,
+3,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+96,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+70,
+192,
+80,
+7,
+164,
+255,
+36,
+204,
+224,
+2,
+0,
+248,
+62,
+0,
+3,
+255,
+55,
+208,
+120,
+3,
+224,
+3,
+190,
+11,
+16,
+139,
+246,
+91,
+0,
+103,
+90,
+0,
+225,
+64,
+242,
+64,
+3,
+232,
+128,
+0,
+0,
+0,
+7,
+232,
+0,
+2,
+0,
+0,
+57,
+239,
+224,
+247,
+255,
+255,
+72,
+192,
+95,
+207,
+88,
+122,
+88,
+124,
+137,
+64,
+26,
+64,
+4,
+232,
+64,
+0,
+0,
+0,
+149,
+96,
+161,
+64,
+152,
+64,
+128,
+144,
+35,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+27,
+0,
+4,
+232,
+0,
+8,
+0,
+0,
+69,
+96,
+145,
+64,
+168,
+64,
+128,
+144,
+19,
+0,
+72,
+232,
+0,
+4,
+0,
+0,
+65,
+232,
+32,
+0,
+0,
+0,
+128,
+144,
+11,
+0,
+74,
+232,
+0,
+8,
+0,
+0,
+242,
+140,
+221,
+192,
+57,
+239,
+32,
+8,
+0,
+0,
+41,
+3,
+239,
+3,
+12,
+248,
+0,
+128,
+0,
+0,
+192,
+248,
+4,
+0,
+12,
+248,
+0,
+132,
+64,
+0,
+192,
+248,
+4,
+0,
+0,
+96,
+255,
+159,
+154,
+255,
+0,
+232,
+0,
+4,
+0,
+0,
+255,
+159,
+165,
+255,
+4,
+255,
+48,
+204,
+16,
+3,
+224,
+251,
+62,
+0,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+128,
+64,
+6,
+232,
+64,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+32,
+247,
+240,
+207,
+16,
+3,
+32,
+247,
+176,
+207,
+17,
+19,
+32,
+247,
+112,
+207,
+18,
+35,
+32,
+247,
+48,
+207,
+19,
+51,
+32,
+247,
+240,
+206,
+20,
+67,
+32,
+247,
+176,
+206,
+21,
+83,
+32,
+247,
+112,
+206,
+22,
+99,
+32,
+247,
+48,
+206,
+23,
+115,
+32,
+247,
+240,
+205,
+24,
+131,
+32,
+247,
+176,
+205,
+25,
+147,
+32,
+247,
+112,
+205,
+26,
+163,
+32,
+247,
+48,
+205,
+27,
+179,
+32,
+247,
+240,
+204,
+28,
+195,
+32,
+247,
+176,
+204,
+29,
+211,
+32,
+247,
+112,
+204,
+30,
+227,
+32,
+247,
+48,
+204,
+31,
+243,
+4,
+255,
+51,
+204,
+128,
+3,
+224,
+251,
+16,
+0,
+76,
+254,
+51,
+204,
+128,
+3,
+224,
+251,
+20,
+0,
+0,
+237,
+32,
+0,
+0,
+0,
+140,
+248,
+47,
+0,
+0,
+0,
+224,
+99,
+0,
+0,
+111,
+3,
+4,
+254,
+0,
+128,
+0,
+4,
+0,
+248,
+0,
+0,
+2,
+232,
+32,
+0,
+0,
+0,
+140,
+248,
+32,
+0,
+0,
+0,
+224,
+35,
+0,
+0,
+64,
+232,
+0,
+2,
+0,
+0,
+193,
+232,
+0,
+1,
+0,
+0,
+1,
+106,
+116,
+30,
+90,
+0,
+169,
+3,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+137,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+129,
+0,
+131,
+102,
+0,
+158,
+67,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+108,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+100,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+161,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+150,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+182,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+112,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+101,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+103,
+255,
+239,
+3,
+0,
+254,
+0,
+143,
+92,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+93,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+210,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+211,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+107,
+0,
+8,
+255,
+99,
+23,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+23,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+52,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+52,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+0,
+143,
+12,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+143,
+13,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+64,
+142,
+18,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+0,
+142,
+19,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+33,
+0,
+8,
+255,
+99,
+3,
+0,
+212,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+3,
+0,
+228,
+192,
+51,
+0,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+8,
+255,
+99,
+4,
+0,
+164,
+192,
+51,
+0,
+0,
+8,
+255,
+163,
+4,
+0,
+148,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+137,
+47,
+240,
+40,
+246,
+2,
+140,
+47,
+240,
+128,
+245,
+99,
+140,
+5,
+4,
+0,
+247,
+99,
+140,
+1,
+20,
+88,
+246,
+99,
+140,
+1,
+20,
+0,
+247,
+35,
+136,
+62,
+226,
+32,
+247,
+35,
+136,
+32,
+210,
+0,
+247,
+34,
+136,
+63,
+2,
+208,
+246,
+34,
+136,
+0,
+4,
+0,
+247,
+99,
+136,
+58,
+162,
+32,
+247,
+99,
+136,
+33,
+146,
+0,
+247,
+98,
+136,
+59,
+18,
+208,
+246,
+98,
+136,
+0,
+20,
+0,
+247,
+162,
+136,
+33,
+2,
+88,
+246,
+98,
+137,
+2,
+68,
+88,
+246,
+162,
+137,
+3,
+68,
+208,
+254,
+227,
+136,
+60,
+242,
+192,
+243,
+188,
+11,
+208,
+254,
+227,
+136,
+56,
+178,
+192,
+243,
+188,
+10,
+32,
+255,
+226,
+136,
+38,
+58,
+192,
+243,
+60,
+0,
+208,
+254,
+227,
+136,
+59,
+242,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+49,
+58,
+192,
+243,
+60,
+128,
+0,
+255,
+226,
+136,
+34,
+34,
+192,
+243,
+60,
+128,
+32,
+255,
+226,
+136,
+37,
+58,
+192,
+243,
+60,
+128,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+194,
+8,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+255,
+202,
+40,
+0,
+52,
+195,
+243,
+0,
+128,
+0,
+254,
+0,
+240,
+35,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+192,
+136,
+1,
+4,
+0,
+240,
+0,
+160,
+0,
+255,
+226,
+140,
+34,
+34,
+195,
+243,
+60,
+0,
+32,
+255,
+227,
+140,
+36,
+58,
+192,
+243,
+60,
+0,
+0,
+254,
+192,
+136,
+0,
+4,
+0,
+240,
+0,
+160,
+16,
+246,
+226,
+136,
+35,
+50,
+16,
+246,
+226,
+136,
+35,
+50,
+32,
+246,
+226,
+136,
+35,
+50,
+32,
+254,
+226,
+136,
+35,
+58,
+192,
+243,
+60,
+0,
+11,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+115,
+5,
+106,
+0,
+144,
+173,
+1,
+27,
+96,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+227,
+0,
+64,
+246,
+163,
+140,
+1,
+4,
+0,
+246,
+192,
+175,
+63,
+2,
+0,
+246,
+192,
+174,
+59,
+2,
+0,
+246,
+128,
+175,
+62,
+2,
+0,
+246,
+128,
+174,
+58,
+2,
+0,
+246,
+64,
+175,
+61,
+2,
+0,
+246,
+64,
+174,
+57,
+2,
+0,
+255,
+43,
+240,
+4,
+212,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+228,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+191,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+143,
+52,
+242,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+212,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+180,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+190,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+143,
+52,
+226,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+180,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+191,
+226,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+212,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+196,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+189,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+143,
+52,
+210,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+148,
+192,
+243,
+128,
+11,
+64,
+254,
+43,
+240,
+1,
+164,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+180,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+240,
+1,
+244,
+192,
+243,
+128,
+10,
+64,
+254,
+43,
+141,
+0,
+228,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+187,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+235,
+142,
+52,
+178,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+2,
+148,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+64,
+254,
+43,
+141,
+0,
+244,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+2,
+68,
+32,
+247,
+35,
+141,
+186,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+171,
+142,
+52,
+162,
+192,
+243,
+60,
+128,
+0,
+255,
+43,
+240,
+4,
+244,
+192,
+243,
+128,
+11,
+0,
+255,
+43,
+240,
+187,
+162,
+192,
+243,
+188,
+10,
+128,
+253,
+43,
+240,
+3,
+148,
+192,
+243,
+128,
+10,
+64,
+254,
+35,
+141,
+1,
+132,
+192,
+243,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+32,
+247,
+35,
+141,
+185,
+66,
+240,
+246,
+35,
+141,
+50,
+66,
+0,
+255,
+107,
+142,
+52,
+146,
+192,
+243,
+60,
+128,
+64,
+255,
+98,
+141,
+0,
+52,
+192,
+243,
+0,
+0,
+0,
+254,
+0,
+240,
+53,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+1,
+4,
+0,
+240,
+64,
+147,
+5,
+106,
+0,
+144,
+177,
+0,
+88,
+246,
+163,
+140,
+1,
+4,
+128,
+245,
+99,
+141,
+10,
+4,
+88,
+246,
+162,
+138,
+1,
+68,
+0,
+247,
+162,
+138,
+36,
+162,
+88,
+254,
+162,
+138,
+3,
+164,
+192,
+243,
+128,
+11,
+0,
+255,
+226,
+137,
+32,
+2,
+195,
+243,
+60,
+0,
+32,
+247,
+226,
+137,
+42,
+114,
+0,
+255,
+34,
+138,
+33,
+18,
+195,
+243,
+60,
+0,
+32,
+247,
+34,
+138,
+42,
+130,
+16,
+246,
+98,
+138,
+40,
+114,
+16,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+98,
+138,
+41,
+146,
+32,
+246,
+226,
+137,
+41,
+146,
+40,
+246,
+34,
+138,
+41,
+146,
+32,
+247,
+163,
+141,
+63,
+178,
+32,
+247,
+227,
+141,
+62,
+162,
+0,
+254,
+0,
+240,
+8,
+4,
+0,
+240,
+128,
+11,
+128,
+253,
+35,
+240,
+9,
+100,
+192,
+243,
+128,
+10,
+128,
+253,
+163,
+141,
+128,
+115,
+192,
+243,
+152,
+10,
+88,
+246,
+163,
+141,
+4,
+100,
+208,
+246,
+35,
+139,
+0,
+100,
+32,
+255,
+34,
+139,
+53,
+202,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+139,
+0,
+4,
+0,
+240,
+0,
+160,
+240,
+246,
+163,
+141,
+48,
+98,
+0,
+247,
+99,
+139,
+63,
+210,
+0,
+247,
+98,
+139,
+1,
+212,
+88,
+254,
+98,
+139,
+1,
+212,
+192,
+243,
+128,
+11,
+32,
+255,
+99,
+139,
+62,
+98,
+192,
+243,
+188,
+10,
+88,
+246,
+98,
+139,
+1,
+212,
+240,
+246,
+98,
+139,
+50,
+210,
+0,
+247,
+163,
+128,
+59,
+146,
+0,
+247,
+160,
+128,
+1,
+36,
+88,
+254,
+160,
+128,
+1,
+36,
+192,
+243,
+128,
+11,
+0,
+247,
+163,
+128,
+58,
+98,
+64,
+255,
+35,
+240,
+0,
+100,
+192,
+243,
+128,
+10,
+64,
+255,
+163,
+128,
+0,
+164,
+192,
+243,
+128,
+10,
+88,
+246,
+160,
+128,
+1,
+36,
+240,
+246,
+160,
+128,
+50,
+34,
+8,
+255,
+227,
+143,
+54,
+242,
+192,
+243,
+60,
+128,
+40,
+255,
+227,
+142,
+54,
+178,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+39,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+143,
+45,
+226,
+192,
+243,
+60,
+128,
+0,
+254,
+0,
+240,
+44,
+10,
+0,
+240,
+60,
+0,
+0,
+254,
+0,
+240,
+40,
+10,
+0,
+240,
+60,
+128,
+8,
+255,
+163,
+142,
+2,
+162,
+192,
+243,
+60,
+128,
+90,
+0,
+169,
+3,
+14,
+96,
+4,
+31,
+169,
+3,
+30,
+96,
+1,
+31,
+73,
+64,
+52,
+64,
+45,
+64,
+2,
+64,
+10,
+64,
+64,
+198,
+1,
+7,
+8,
+232,
+63,
+0,
+0,
+0,
+6,
+232,
+253,
+255,
+255,
+255,
+0,
+246,
+0,
+0,
+0,
+4,
+215,
+64,
+3,
+96,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+143,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+135,
+0,
+131,
+102,
+0,
+158,
+71,
+0,
+2,
+248,
+0,
+35,
+0,
+0,
+64,
+56,
+0,
+0,
+4,
+248,
+0,
+36,
+0,
+0,
+64,
+56,
+8,
+0,
+0,
+240,
+64,
+0,
+132,
+3,
+30,
+106,
+132,
+24,
+128,
+240,
+0,
+0,
+132,
+3,
+128,
+144,
+112,
+0,
+131,
+98,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+104,
+0,
+131,
+102,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+123,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+112,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+3,
+99,
+131,
+71,
+68,
+232,
+32,
+0,
+0,
+0,
+0,
+99,
+2,
+99,
+23,
+102,
+7,
+106,
+127,
+156,
+178,
+255,
+0,
+248,
+64,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+30,
+106,
+134,
+24,
+128,
+248,
+0,
+0,
+112,
+0,
+192,
+243,
+211,
+31,
+128,
+144,
+72,
+0,
+188,
+64,
+67,
+232,
+0,
+2,
+0,
+0,
+0,
+255,
+64,
+0,
+0,
+20,
+200,
+243,
+0,
+0,
+128,
+144,
+61,
+0,
+195,
+232,
+0,
+2,
+0,
+0,
+12,
+128,
+7,
+192,
+130,
+248,
+0,
+0,
+112,
+192,
+224,
+16,
+195,
+31,
+132,
+248,
+1,
+0,
+112,
+0,
+224,
+16,
+203,
+31,
+25,
+102,
+9,
+106,
+2,
+30,
+41,
+3,
+26,
+87,
+162,
+64,
+64,
+198,
+1,
+23,
+127,
+158,
+95,
+255,
+239,
+3,
+0,
+254,
+128,
+143,
+94,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+95,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+208,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+209,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+47,
+0,
+8,
+255,
+227,
+23,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+52,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+239,
+3,
+0,
+254,
+128,
+143,
+14,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+143,
+15,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+192,
+142,
+16,
+0,
+0,
+240,
+12,
+0,
+0,
+254,
+128,
+142,
+17,
+0,
+0,
+240,
+12,
+0,
+128,
+144,
+13,
+0,
+8,
+255,
+227,
+3,
+0,
+244,
+192,
+51,
+0,
+0,
+8,
+255,
+35,
+4,
+0,
+180,
+192,
+51,
+0,
+0,
+111,
+3,
+32,
+246,
+192,
+11,
+1,
+16,
+32,
+246,
+2,
+140,
+47,
+240,
+32,
+247,
+35,
+141,
+63,
+178,
+64,
+254,
+35,
+141,
+2,
+68,
+192,
+243,
+128,
+11,
+32,
+255,
+35,
+240,
+58,
+226,
+192,
+243,
+188,
+10,
+0,
+254,
+0,
+141,
+4,
+4,
+0,
+240,
+128,
+10,
+88,
+246,
+35,
+141,
+3,
+68,
+240,
+246,
+35,
+141,
+48,
+66,
+0,
+247,
+227,
+143,
+52,
+242,
+32,
+247,
+227,
+142,
+52,
+178,
+90,
+0,
+161,
+3,
+6,
+64,
+23,
+64,
+96,
+8,
+70,
+98,
+97,
+8,
+70,
+98,
+98,
+8,
+70,
+98,
+99,
+8,
+70,
+98,
+100,
+8,
+70,
+98,
+101,
+8,
+70,
+98,
+255,
+159,
+8,
+250,
+23,
+102,
+7,
+106,
+112,
+30,
+33,
+3,
+};
diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
new file mode 100644
index 0000000..5543093
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.s
@@ -0,0 +1,917 @@
+# ******************************************************************************
+# Argon Design Ltd.
+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+#
+# Module : HEVC
+# Author : Peter de Rivaz
+# ******************************************************************************
+
+# HEVC VPU Transform
+#
+# Transform matrix can be thought of as
+# output row vector = input row vector * transMatrix2
+#
+# The even rows of the matrix are symmetric
+# The odd rows of the matrix are antisymmetric
+#
+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+#
+# EXAMPLE
+# (a b c d) (1 2 2 1)
+# (3 4 -4 -3)
+# (5 6 6 5)
+# (7 8 -8 -7)
+#
+# x=(a c)(1 2) = 1a+5c 2a+6c
+# (5 6)
+#
+# y=(b d)(3 4) = 3b+7d 4b+8d
+# (7 8)
+#
+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+#
+# Final results are (u , v[::-1])
+#
+#
+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+# Apply the even matrix first and stop before rounding
+# Then apply the odd matrix in a full manner:
+#
+# First step is to compute partial products with the first input (16 cycles)
+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
+# 2a 4b 6c 8d
+# 2a -4b 6c -8d
+# 1a -3b 5c -7d
+#
+# Second step is to sum partial products into final position (8 cycles)
+# 1a+3b+5c+7d
+# 2a+4b+6c+8d
+# 2a-4b+6c-8d
+# 1a-3b+5c-7d
+#
+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+#
+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+#
+# For 8x8 we could compute two in parallel.
+#
+#
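+# Illustrative reference only (not part of the VPU code): the even/odd split in
+# the EXAMPLE above, written out as C. The function name and matrix indexing
+# are assumptions made for the illustration.
+#
+#   void trans4(const int in[4], const int m[4][4], int out[4])
+#   {
+#       int x0 = in[0]*m[0][0] + in[2]*m[2][0];   /* even half, uses (a c) */
+#       int x1 = in[0]*m[0][1] + in[2]*m[2][1];
+#       int y0 = in[1]*m[1][0] + in[3]*m[3][0];   /* odd half, uses (b d) */
+#       int y1 = in[1]*m[1][1] + in[3]*m[3][1];
+#       out[0] = x0 + y0; out[1] = x1 + y1;       /* u = x + y */
+#       out[3] = x0 - y0; out[2] = x1 - y1;       /* v = x - y, stored reversed */
+#   }
+#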
+
+# Columns are transformed first
+#
+# Store top left half of transMatrix2 in HX(32,0)
+# Store bottom left half of transMatrix2 in HX(32,32)
+#
+# For 16x16
+# HX(0:15,0) contains input data before transform
+# HY(0:15,0) contains 32bit output data after transform
+# HX(32,0) contains even rows of left half of transMatrix2
+# HX(32,32) contains odd rows of left half of transMatrix2
+# HY(48,0) contains partial products ready for summing
+#
+
+
+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+# coeffs32
+# num32: number of 32x32 transforms
+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
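+#          (the dispatch on r5 just below also accepts: 2 for hevc_deblock_16x16,
+#           3 for hevc_uv_deblock_16x16, 4 for hevc_uv_deblock_16x16_with_clear,
+#           5 for hevc_run_command_list)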
+#
+hevc_trans_16x16:
+ cmp r5,1
+ beq memclear16
+ cmp r5,2
+ beq hevc_deblock_16x16
+ cmp r5,3
+ beq hevc_uv_deblock_16x16
+ cmp r5,4
+ beq hevc_uv_deblock_16x16_with_clear
+ cmp r5,5
+ beq hevc_run_command_list
+
+ push r6-r15, lr # TODO cut down number of used registers
+ mov r14,r3 # coeffs32
+ mov r15,r4 # num32
+ mov r3, 16*2 # Stride of transMatrix2 in bytes
+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+
+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ # Now use r0 to describe which matrix we are working on.
+ # Allows us to prefetch the next block of coefficients for efficiency.
+ mov r0,0 # This describes the location where we read our coefficients from
+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+ mov r7,16*16*2 # Total block size
+ mov r8,64*16 # Value used to swap from current to next VRF location
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ mov r4,64 # Constant used for rounding first pass
+ mov r5,1<<11 # Constant used for rounding second pass
+
+ # At start of block r0,r1 point to the current block (that has already been loaded)
+block_loop:
+ eor r0,r8
+ add r1,r7
+ # Prefetch the next block
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
+ eor r0,r8
+ sub r1,r7
+
+ # Transform the current block
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
+
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
+
+ # Save results - note there has been a transposition during the processing so we save columns
+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
+
+ # Move onto next block
+ eor r0,r8
+ add r1,r7
+
+ addcmpbgt r2,-1,0,block_loop
+
+ # Now go and do any 32x32 transforms
+ b hevc_trans_32x32
+
+ pop r6-r15, pc
+
+# r1,r2,r3 r7,r8 should be preserved
+# HX(0++,0)+r0 is the block to be transformed
+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+# Use HY(48,0) for intermediate results
+# r0 can be used, but should be returned to its original value at the end
+col_trans_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+col_trans_odd_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_odd_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_odd_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 32x32 transforms to be done
+#
+hevc_trans_32x32:
+ mov r1,r14 # coeffs
+ mov r2,r15 # num
+
+ # Fetch odd transform matrix
+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+ #add r0, 16*16*2
+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+ mov r7, 16*16*2 # Total block size
+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+ # set r8 to 32byte aligned stack pointer
+ add r8,sp,31
+ lsr r8,5
+ lsl r8,5
+ mov r9,r8 # Backup of the temporary storage
+ mov r10,r1 # Backup of the coefficient buffer
+block_loop32:
+
+ # COLUMN TRANSFORM
+ mov r4, 64 # Constant used for rounding first pass
+ mov r5, 9 # left shift used for rounding first pass
+
+ # Transform the first 16 columns
+ mov r1,r10 # Input Coefficient buffer
+ mov r8,r9 # Output temporary storage
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ # ROW TRANSFORM
+ mov r4, 1<<11 # Constant used for rounding second pass
+ mov r5, 4 # left shift used for rounding second pass
+
+ mov r1,r9 # Input temporary storage
+ mov r8,r10 # Output Coefficient buffer
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ add r10, 32*32*2 # move onto next block of coefficients
+ addcmpbgt r2,-1,0,block_loop32
+
+ add sp,sp,32*32*2+32 # Restore stack
+
+ pop r6-r15, pc
+
+trans32:
+ push lr
+ # We can no longer afford the VRF space to do prefetching when doing 32x32
+ # Fetch the even rows
+ vldh HX(0++,0),(r1 += r3) REP 16
+ # Fetch the odd rows
+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+
+ # Transform the even rows using even matrix
+ mov r0, 0 # Even rows
+ bl col_trans_16
+
+ # Now transform the odd rows using odd matrix
+ mov r0, 64*16 # Odd rows
+ bl col_trans_odd_16
+
+ # Now apply butterfly to compute the first 16 results
+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ # 16bit results now in HX(48,32)
+ mov r0,r8
+ mov r6,32*2
+ vsth VX(48,32++),(r0+=r6) REP 16
+
+ # Now apply butterfly to compute the second 16 results (in reverse order)
+ vsub HY(63,0),HY(0 ,0),HY(16,0)
+ vsub HY(62,0),HY(1 ,0),HY(17,0)
+ vsub HY(61,0),HY(2 ,0),HY(18,0)
+ vsub HY(60,0),HY(3 ,0),HY(19,0)
+ vsub HY(59,0),HY(4 ,0),HY(20,0)
+ vsub HY(58,0),HY(5 ,0),HY(21,0)
+ vsub HY(57,0),HY(6 ,0),HY(22,0)
+ vsub HY(56,0),HY(7 ,0),HY(23,0)
+ vsub HY(55,0),HY(8 ,0),HY(24,0)
+ vsub HY(54,0),HY(9 ,0),HY(25,0)
+ vsub HY(53,0),HY(10,0),HY(26,0)
+ vsub HY(52,0),HY(11,0),HY(27,0)
+ vsub HY(51,0),HY(12,0),HY(28,0)
+ vsub HY(50,0),HY(13,0),HY(29,0)
+ vsub HY(49,0),HY(14,0),HY(30,0)
+ vsub HY(48,0),HY(15,0),HY(31,0)
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ add r0,r8,32
+ vsth VX(48,32++),(r0+=r6) REP 16
+ pop pc
+
+memclear16:
+ # r0 is address
+ # r1 is number of 16-bit values to set to 0 (may overrun past the end and clear more than specified)
+ vmov HX(0++,0),0 REP 16
+ mov r2,32
+loop:
+ vsth HX(0++,0),(r0+=r2) REP 16
+ add r0,16*16*2
+ sub r1,16*16
+ cmp r1,0
+ bgt loop
+ b lr
+
+
+################################################################################
+# HEVC VPU Deblock
+#
+# Vertical edges before horizontal
+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+#
+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+# The VPU code works in units of 16x16 blocks.
+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+# One final horizontal filter is required at the end.
+# PCM is not allowed in this code.
+#
+#
+# H(16-4:16+15,0) contains the previous block (note that we need 4 lines of context above it that may get altered during filtering)
+# H(16:31,16) contains the current block (note that we do not need the upper lines until the horizontal filtering)
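+#
+# i.e. for blocks 0..N-1 across a row the schedule is:
+#   vert(0), then vert(1)+horz(0), vert(2)+horz(1), ..., vert(N-1)+horz(N-2), and finally horz(N-1)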
+
+.set P0,63
+.set P1,62
+.set P2,61
+.set P3,60
+.set Q0,59
+.set Q1,58
+.set Q2,57
+.set Q3,56
+
+.set dp,32
+.set dq,33
+.set d,34
+.set decision,35
+.set beta,36
+.set beta2,37
+.set beta3,38
+.set ptest,39
+.set qtest,40
+.set pqtest,41
+.set thresh,42
+.set deltatest, 44
+.set deltap1, 45
+.set tc25, 46
+.set setup,47
+.set tc,48
+.set tc25,49
+.set tc2, 50
+.set do_filter, 51
+.set delta, 52
+.set tc10, 53
+.set delta0, 54
+.set delta1, 55
+.set zeros, 0
+.set setup_input, 1
+.set deltaq1, 2
+
+
+
+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+# Row has num16 16x16 blocks across
+# Beta goes from 0 to 64
+# tc goes from 0 to 24
+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+# has 8 bytes per edge
+# has 16 bytes per direction
+# has 32 bytes per 16x16 block
+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
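+# (For reference, an illustrative C view of the same layout, not used by this code:
+#   uint8_t setup[num16][2 /*0=vert,1=horz*/][2 /*0=first,1=second edge*/][2 /*0=beta,1=tc*/][4 /*edge no*/];
+#  so setup[b][d][e][0][i] holds beta and setup[b][d][e][1][i] holds tc for the
+#  i'th 4-pixel run of edge e in direction d of 16x16 block b.)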
+hevc_deblock_16x16:
+ push r6-r15, lr
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+
+process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl vert_filter
+ sub r3,8
+ b start_deblock_loop
+deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt start_again
+ pop r6-r15, pc
+start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+vert_filter:
+ push lr
+
+ vmov HX(P3,0), V(16,12)+r3
+ vmov HX(P2,0), V(16,13)+r3
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+ vmov HX(Q2,0), V(16,18)+r3
+ vmov HX(Q3,0), V(16,19)+r3
+
+ bl do_luma_filter
+
+ vadds V(16,13)+r3, HX(P2,0), 0
+ vadds V(16,14)+r3, HX(P1,0), 0
+ vadds V(16,15)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds V(16,16)+r3, HX(Q0,0), 0
+ vadds V(16,17)+r3, HX(Q1,0), 0
+ vadds V(16,18)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+horz_filter:
+ push lr
+
+ vmov HX(P3,0), H(12,0)+r3
+ vmov HX(P2,0), H(13,0)+r3
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+ vmov HX(Q2,0), H(18,0)+r3
+ vmov HX(Q3,0), H(19,0)+r3
+
+ bl do_luma_filter
+
+ vadds H(13,0)+r3, HX(P2,0), 0
+ vadds H(14,0)+r3, HX(P1,0), 0
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+ vadds H(17,0)+r3, HX(Q1,0), 0
+ vadds H(18,0)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4 length edge
+do_luma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+ valtl HX(beta,0),H(setup,0),H(setup,0)
+ valtu HX(tc,0),H(setup,0),H(setup,0)
+ vmul HX(tc25,0), HX(tc,0), 5
+ vadd HX(tc25,0),HX(tc25,0), 1
+ vasr HX(tc25,0), HX(tc25,0), 1
+
+ # Compute decision
+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+
+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+
+ vadd HX(d,0), HX(dp,0), HX(dq,0)
+ vasr HX(beta2,0),HX(beta,0),2
+ vasr HX(beta3,0),HX(beta,0),3
+
+ # Compute flags that are negative if all conditions pass
+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+
+ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+ vmov HX(decision,0), 1 IFNN
+ vadd H(decision,0),H(decision,3),0 IFN
+ vadd H(decision,16),H(decision,19),0 IFN
+ vmov -,HX(decision,0) SETF # N marks strong filter
+ vmov HX(decision,0), 1 IFNN # NN marks normal filter
+
+ vadd HX(do_filter,0), HX(d,3), HX(d,0)
+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+ vmov HX(decision,0),0 IFNN # Z marks no filter
+
+ # Expand out the decision (currently only one valid value every 4 pixels) 0...1...2...3
+ # First extract out even terms
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
+ # Now expand back
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+
+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+
+ # Do a quick check to see if there is anything to do
+ mov r11, 0 # Signal no filtering
+ vmov -,1 IFNZ SUMS r5
+ cmp r5,0
+ beq filtering_done
+ mov r11, 1 # Signal some filtering
+ # And whether there is any strong filtering
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq normal_filtering
+
+ ##############################################################################
+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+
+ # Take a copy of the original pixels for use in decision calculation
+ vmov HX(P0,32),HX(P0,0)
+ vmov HX(Q0,32),HX(Q0,0)
+ vmov HX(P1,32),HX(P1,0)
+ vmov HX(Q1,32),HX(Q1,0)
+ vmov HX(P2,32),HX(P2,0)
+ vmov HX(Q2,32),HX(Q2,0)
+
+ vadd -,HX(P2,32),4 CLRA SACC
+ vshl -,HX(P1,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl HX(delta,0),HX(Q1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+
+ vadd -,HX(P2,32),2 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vshl HX(delta,0),HX(Q0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(P1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+
+ vadd -,HX(Q0,32),4 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vmul -,HX(P2,32),3 SACC
+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+ #vmov HX(P2,0),3 IFN
+
+ # Now reverse all P/Qs
+
+ vadd -,HX(Q2,32),4 CLRA SACC
+ vshl -,HX(Q1,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl HX(delta,0),HX(P1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+
+ vadd -,HX(Q2,32),2 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vshl HX(delta,0),HX(P0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+
+ vadd -,HX(P0,32),4 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vmul -,HX(Q2,32),3 SACC
+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+
+ ##############################################################################
+ # Normal filtering
+normal_filtering:
+ # Invert the decision flags
+ # make the instruction more complicated as the assembler has an error and loses SETF
+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+ vmov -, HX(tc10,0) SETF # IFN means normal filtering
+
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq filtering_done
+
+ vasr HX(tc2,0), HX(tc,0), 1
+ vmul HX(tc10,0), HX(tc,0), 10
+
+ vasr HX(thresh,0), HX(beta,0), 1
+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+
+ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+ # Expand ptest and qtest together
+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+
+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+ vmov -,8 CLRA SACC
+ vmul -,HX(delta0,0), 9 SACC
+ vmul HX(delta0,0),HX(delta1,0), r6 SACC
+ vasr HX(delta0,0), HX(delta0,0), 4
+ vdist HX(deltatest,0), HX(delta0,0), 0
+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+
+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+
+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+ vadd HX(deltap1,0), HX(deltap1,0), 1
+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+ vasr HX(deltap1,0), HX(deltap1,0), 1
+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+
+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+ vadd HX(deltaq1,0), HX(deltaq1,0), 1
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+ vrsub -, HX(delta0,0), 0 SACC
+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1
+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+
+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+
+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+
+ vmov -,HX(deltatest,0) SETF
+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+
+ #vmov HX(P2,0),1 IFN
+
+filtering_done:
+ b lr
+
+
+hevc_uv_deblock_16x16:
+ push r6-r15, lr
+ mov r14,0
+ b hevc_uv_start
+hevc_uv_deblock_16x16_with_clear:
+ push r6-r15, lr
+ mov r14,1
+ b hevc_uv_start
+
+hevc_uv_start:
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+# r14 is 1 if we should clear the old contents, or 0 if not
+
+uv_process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ cmp r14,1
+ bne uv_skip0
+ vstb H(zeros,0),(r4)
+uv_skip0:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl uv_vert_filter
+ sub r3,8
+ b uv_start_deblock_loop
+uv_deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ cmp r14,1
+ bne uv_skip1
+ vstb H(zeros,0),(r4)
+uv_skip1:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip3
+ vstb H(zeros,0),-16(r4)
+uv_skip3:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,uv_skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+uv_start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt uv_deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip2
+ vstb H(zeros,0),-16(r4)
+uv_skip2:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,uv_skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt uv_start_again
+ pop r6-r15, pc
+uv_start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b uv_process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+uv_vert_filter:
+ push lr
+
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+
+ bl do_chroma_filter
+
+ vadds V(16,15)+r3, HX(P0,0), 0
+ vadds V(16,16)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+uv_horz_filter:
+ push lr
+
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+
+ bl do_chroma_filter
+
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4 length edge
+do_chroma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+ valtl HX(tc,0),H(setup,0),H(setup,0)
+
+ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+ vsub -,HX(P1,0),HX(Q1,0) SACC
+ vmov HX(delta,0),4 SACC
+ vasr HX(delta,0),HX(delta,0),3
+ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+ vadd HX(P0,0),HX(P0,0),HX(delta,0)
+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+ b lr
+
+# r0 = list
+# r1 = number
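+# Each command is six consecutive 32-bit words giving r0..r5 for one call to
+# hevc_trans_16x16 (r5 selects the operation - see the dispatch at the top of this file)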
+hevc_run_command_list:
+ push r6-r7, lr
+ mov r6, r0
+ mov r7, r1
+loop_cmds:
+ ld r0,(r6) # How to encode r6++?
+ add r6,4
+ ld r1,(r6)
+ add r6,4
+ ld r2,(r6)
+ add r6,4
+ ld r3,(r6)
+ add r6,4
+ ld r4,(r6)
+ add r6,4
+ ld r5,(r6)
+ add r6,4
+ bl hevc_trans_16x16
+ sub r7,1
+ cmp r7,0
+ bgt loop_cmds
+
+ pop r6-r7, pc
diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
new file mode 100644
index 0000000..3904efc
--- /dev/null
+++ b/libavcodec/rpi_mailbox.c
@@ -0,0 +1,340 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+#include <linux/ioctl.h>
+
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+#include "rpi_mailbox.h"
+
+#define PAGE_SIZE (4*1024)
+
+// Shared memory will not be cached in ARM cache
+void *mapmem_shared(unsigned base, unsigned size)
+{
+ int mem_fd;
+ unsigned offset = base % PAGE_SIZE;
+ base = base - offset;
+ /* open /dev/mem */
+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+ return NULL;
+ }
+ void *mem = mmap(
+ 0,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED/*|MAP_FIXED*/,
+ mem_fd,
+ base);
+#ifdef DEBUG
+ printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+ if (mem == MAP_FAILED) {
+ printf("mmap error %d\n", (int)mem);
+ return NULL;
+ }
+ close(mem_fd);
+ return (char *)mem + offset;
+}
+
+// Unshared memory will be faster as it lives in the ARM cache, but requires cache flushing
+void *mapmem_private(unsigned base, unsigned size)
+{
+ int mem_fd;
+ unsigned offset = base % PAGE_SIZE;
+ base = base - offset;
+ /* open /dev/mem */
+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+ return NULL;
+ }
+ void *mem = mmap(
+ 0,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_PRIVATE/*|MAP_FIXED*/,
+ mem_fd,
+ base);
+#ifdef DEBUG
+ printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+ if (mem == MAP_FAILED) {
+ printf("mmap error %d\n", (int)mem);
+ return NULL;
+ }
+ close(mem_fd);
+ return (char *)mem + offset;
+}
+
+void unmapmem(void *addr, unsigned size)
+{
+ int s = munmap(addr, size);
+ if (s != 0) {
+ printf("munmap error %d\n", s);
+ exit (-1);
+ }
+}
+
+/*
+ * use ioctl to send mbox property message
+ */
+
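+// Property buffer layout, in 32-bit words (as built by the helpers below):
+//   [0] total size in bytes, [1] request code (0), then one or more tags of
+//   (tag id, value buffer size, data size, value words...), ended by a 0 end tag.
+// The firmware writes its response back into the same buffer, which is why all
+// of the helpers below return p[5] - the first value word of their single tag.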
+static int mbox_property(int file_desc, void *buf)
+{
+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+ if (ret_val < 0) {
+ printf("ioctl_set_msg failed:%d\n", ret_val);
+ }
+
+#ifdef DEBUG
+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+ for (i=0; i<size/4; i++)
+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+#endif
+ return ret_val;
+}
+
+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000c; // (the tag id)
+ p[i++] = 12; // (size of the buffer)
+ p[i++] = 12; // (size of the data)
+ p[i++] = size; // (num bytes? or pages?)
+ p[i++] = align; // (alignment)
+ p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_free(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000f; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_lock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000d; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_unlock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000e; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x30010; // (the tag id)
+ p[i++] = 28; // (size of the buffer)
+ p[i++] = 28; // (size of the data)
+ p[i++] = code;
+ p[i++] = r0;
+ p[i++] = r1;
+ p[i++] = r2;
+ p[i++] = r3;
+ p[i++] = r4;
+ p[i++] = r5;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned qpu_enable(int file_desc, unsigned enable)
+{
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x30012; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = enable;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+ p[i++] = 0x30011; // (the tag id)
+ p[i++] = 16; // (size of the buffer)
+ p[i++] = 16; // (size of the data)
+ p[i++] = num_qpus;
+ p[i++] = control;
+ p[i++] = noflush;
+ p[i++] = timeout; // ms
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+void execute_multi(int file_desc,
+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+ p[i++] = 0x30018; // (the tag id)
+ p[i++] = 88; // (size of the buffer)
+ p[i++] = 88; // (size of the data)
+
+ p[i++] = num_qpus;
+ p[i++] = control;
+ p[i++] = noflush;
+ p[i++] = timeout; // ms
+
+ p[i++] = num_qpus_2;
+ p[i++] = control_2;
+ p[i++] = noflush_2;
+ p[i++] = timeout_2; // ms
+
+ p[i++] = code;
+ p[i++] = r0;
+ p[i++] = r1;
+ p[i++] = r2;
+ p[i++] = r3;
+ p[i++] = r4;
+ p[i++] = r5;
+
+ p[i++] = code_2;
+ p[i++] = r0_2;
+ p[i++] = r1_2;
+ p[i++] = r2_2;
+ p[i++] = r3_2;
+ p[i++] = r4_2;
+ p[i++] = r5_2;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return;
+}
+
+int mbox_open() {
+ int file_desc;
+
+ // open a char device file used for communicating with kernel mbox driver
+ file_desc = open(DEVICE_FILE_NAME, 0);
+ if (file_desc < 0) {
+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+ }
+ return file_desc;
+}
+
+void mbox_close(int file_desc) {
+ close(file_desc);
+}
diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
new file mode 100644
index 0000000..5898102
--- /dev/null
+++ b/libavcodec/rpi_mailbox.h
@@ -0,0 +1,25 @@
+#ifndef RPI_MAILBOX_H
+#define RPI_MAILBOX_H
+
+extern int mbox_open(void);
+extern void mbox_close(int file_desc);
+
+extern unsigned get_version(int file_desc);
+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+extern unsigned mem_free(int file_desc, unsigned handle);
+extern unsigned mem_lock(int file_desc, unsigned handle);
+extern unsigned mem_unlock(int file_desc, unsigned handle);
+extern void *mapmem_shared(unsigned base, unsigned size);
+extern void *mapmem_private(unsigned base, unsigned size);
+extern void unmapmem(void *addr, unsigned size);
+
+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+extern void execute_multi(int file_desc,
+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+extern unsigned qpu_enable(int file_desc, unsigned enable);
+
+#endif
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000..a01c051
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,991 @@
+#ifdef RPI
+// Use vchiq service for submitting jobs
+#define GPUSERVICE
+
+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+//#define RPI_TIME_TOTAL_QPU
+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPU code
+//#define RPI_TIME_TOTAL_VPU
+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+#define RPI_TIME_TOTAL_POSTED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/avassert.h"
+
+#include "config.h"
+
+#include <pthread.h>
+#include <time.h>
+
+#include "rpi_mailbox.h"
+#include "rpi_qpu.h"
+#include "rpi_shader.h"
+#include "rpi_hevc_transform.h"
+
+#include "rpi_user_vcsm.h"
+#ifdef GPUSERVICE
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+#pragma GCC diagnostic pop
+#endif
+
+// QPU profile flags
+#define NO_FLUSH 1
+#define CLEAR_PROFILE 2
+#define OUTPUT_COUNTS 4
+
+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+
+
+// On Pi2 there is no way to access the VPU L2 cache
+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache)
+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+#define GPU_MEM_FLG 0x4
+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache)
+#define GPU_MEM_MAP 0x0
+
+#define vcos_verify_ge0(x) ((x)>=0)
+
+/*static const unsigned code[] =
+{
+ #include "rpi_shader.hex"
+};*/
+
+// Size in 32bit words
+#define QPU_CODE_SIZE 2048
+#define VPU_CODE_SIZE 2048
+
+const short rpi_transMatrix2even[32][16] = { // Even rows first
+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
+// Odd rows
+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
+
+struct GPU
+{
+ unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
+ int open_count; // Number of allocated video buffers
+ int mb; // Mailbox handle
+ int vc; // Address in GPU memory
+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+};
+
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+static volatile struct GPU* gpu = NULL;
+static GPU_MEM_PTR_T gpu_mem_ptr;
+
+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+static unsigned int Microseconds(void) {
+ struct timespec ts;
+ unsigned int x;
+ static unsigned int base = 0;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+ if (base==0) base=x;
+ return x-base;
+}
+#endif
+
+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+
+// Connect to QPU, returns 0 on success.
+static int gpu_init(volatile struct GPU **gpu) {
+ int mb = mbox_open();
+ int vc;
+ volatile struct GPU* ptr;
+ if (mb < 0)
+ return -1;
+#ifndef RPI_ASYNC
+ if (qpu_enable(mb, 1)) return -2;
+#endif
+ vcsm_init();
+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+ memset((void*)ptr, 0, sizeof *ptr);
+ vc = gpu_mem_ptr.vc;
+
+ ptr->mb = mb;
+ ptr->vc = vc;
+
+ printf("GPU allocated at 0x%x\n",vc);
+
+ *gpu = ptr;
+
+ // Now copy over the QPU code into GPU memory
+ {
+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+ }
+ // And the VPU code
+ {
+ int num_bytes = sizeof(rpi_hevc_transform);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+ }
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
+#ifdef RPI_ASYNC
+ {
+ int err;
+ vpu_async_tail = 0;
+ vpu_async_head = 0;
+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+ //printf("Created thread\n");
+ if (err) {
+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+ return -4;
+ }
+
+ {
+ struct sched_param param = {0};
+ int policy = 0;
+
+ if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+
+ policy = SCHED_FIFO;
+ param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+
+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+
+ if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+ }
+ }
+ }
+
+ }
+
+ }
+#endif
+
+ return 0;
+}
+
+// Returns 1 if the gpu is currently idle
+static int gpu_idle(void)
+{
+ int ret = pthread_mutex_trylock(&gpu_mutex);
+ if (ret==0) {
+ pthread_mutex_unlock(&gpu_mutex);
+ return 1;
+ }
+ return 0;
+}
+
+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+static void gpu_lock(void) {
+ pthread_mutex_lock(&gpu_mutex);
+
+ if (gpu==NULL) {
+ gpu_init(&gpu);
+ }
+}
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+ return 0;
+}
+
+// Allocate memory on the GPU.
+// Fills in structure <p> with the ARM pointer, videocore handle, videocore memory address and numbytes.
+// Returns 0 on success.
+// The memory allocated here is not cached in the ARM data cache,
+// so it is safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_lock();
+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+ gpu->open_count++;
+ gpu_unlock();
+ return r;
+}
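+
+// Illustrative usage sketch (not a call site added by this patch): the
+// intended lifecycle of an uncached allocation, using only functions and
+// fields declared in rpi_qpu.h. The buffer name and size are hypothetical.
+//
+//   GPU_MEM_PTR_T buf;
+//   if (gpu_malloc_uncached(64 * 1024, &buf) == 0) {
+//       memset(buf.arm, 0, buf.numbytes);  // CPU writes through the ARM mapping
+//       // ... pass buf.vc (the bus address) to VPU/QPU code ...
+//       gpu_free(&buf);
+//   }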
+
+int gpu_get_mailbox(void)
+{
+ av_assert0(gpu);
+ return gpu->mb;
+}
+
+// Call this to clean and invalidate a region of memory
+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ iocache.s[0].handle = p->vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int) p->arm;
+ iocache.s[0].size = p->numbytes;
+ vcsm_clean_invalid( &iocache );
+#else
+ void *tmp = vcsm_lock(p->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+#endif
+}
+
+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ iocache.s[0].handle = p0->vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int) p0->arm;
+ iocache.s[0].size = p0->numbytes;
+ iocache.s[1].handle = p1->vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int) p1->arm;
+ iocache.s[1].size = p1->numbytes;
+ iocache.s[2].handle = p2->vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int) p2->arm;
+ iocache.s[2].size = p2->numbytes;
+ vcsm_clean_invalid( &iocache );
+#else
+ void *tmp;
+ tmp = vcsm_lock(p0->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+ tmp = vcsm_lock(p1->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+ tmp = vcsm_lock(p2->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+#endif
+}
+
+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mem_lock(gpu->mb, p->vc_handle);
+ av_assert0(p->vc);
+ return 0;
+}
+
+// This allocates data that will be cached in the ARM L2 but uncached in the VPU L2.
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_lock();
+ r = gpu_malloc_cached_internal(numbytes, p);
+ gpu->open_count++;
+ gpu_unlock();
+ return r;
+}
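+
+// Illustrative sketch of the cached-allocation flow, assuming only the
+// functions in this file: because this memory is cached on the ARM side it
+// must be cleaned/invalidated before the VPU/QPU consumes it (or after the
+// GPU has written it and before the CPU reads it back). The size used here
+// is hypothetical.
+//
+//   GPU_MEM_PTR_T cbuf;
+//   gpu_malloc_cached(1024, &cbuf);
+//   // ... CPU fills cbuf.arm ...
+//   gpu_cache_flush(&cbuf);   // clean+invalidate so the GPU sees the data
+//   // ... submit the VPU/QPU job that reads cbuf.vc ...
+//   gpu_free(&cbuf);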
+
+static void gpu_term(void)
+{
+ int mb;
+
+ if (gpu==NULL)
+ return;
+ mb = gpu->mb;
+
+ // ??? Tear down anything needed for gpuexecute
+
+ qpu_enable(mb, 0);
+ gpu_free_internal(&gpu_mem_ptr);
+
+ vcsm_exit();
+
+ mbox_close(mb);
+ gpu = NULL;
+}
+
+static void gpu_free_internal(GPU_MEM_PTR_T *p) {
+ int mb = gpu->mb;
+ mem_unlock(mb,p->vc_handle);
+ vcsm_unlock_ptr(p->arm);
+ vcsm_free(p->vcsm_handle);
+}
+
+void gpu_free(GPU_MEM_PTR_T *p) {
+ gpu_lock();
+
+ gpu_free_internal(p);
+
+ gpu->open_count--;
+ if (gpu->open_count==0) {
+ printf("Closing GPU\n");
+ gpu_term();
+ gpu = NULL;
+ }
+ gpu_unlock();
+}
+
+unsigned int vpu_get_fn(void) {
+ // Make sure that the gpu is initialized
+ if (gpu==NULL) {
+ printf("Preparing gpu\n");
+ gpu_lock();
+ gpu_unlock();
+ }
+ return gpu->vc + offsetof(struct GPU,vpu_code);
+}
+
+unsigned int vpu_get_constants(void) {
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+ return gpu->vc + offsetof(struct GPU,transMatrix2even);
+}
+
+#ifdef GPUSERVICE
+static void callback(void *cookie)
+{
+ sem_post((sem_t *)cookie);
+}
+#endif
+
+
+static volatile uint32_t post_done = 0;
+static volatile uint32_t post_qed = 0;
+
+static void post_code2_cb(void * v)
+{
+ uint32_t n = (uint32_t)v;
+ if ((int32_t)(n - post_done) > 0) {
+ post_done = n;
+ }
+}
+
+
+// Post a command to the queue
+// Returns an id which we can use to wait for completion
+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+{
+ struct gpu_job_s j[1] = {
+ {
+ .command = EXECUTE_VPU,
+ .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+ .callback.func = post_code2_cb
+ }
+ };
+ uint32_t id;
+
+ j[0].callback.cookie = (void *)(id = ++post_qed);
+
+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+
+ return id;
+}
+
+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ int qpu0_n, const uint32_t * qpu0_mail,
+ int qpu1_n, const uint32_t * qpu1_mail)
+{
+#if 1
+ sem_t sync0;
+ struct gpu_job_s j[4];
+
+ sem_init(&sync0, 0, 0);
+
+ j[0].command = EXECUTE_VPU;
+ j[0].u.v.q[0] = vpu_code;
+ j[0].u.v.q[1] = r0;
+ j[0].u.v.q[2] = r1;
+ j[0].u.v.q[3] = r2;
+ j[0].u.v.q[4] = r3;
+ j[0].u.v.q[5] = r4;
+ j[0].u.v.q[6] = r5;
+ j[0].callback.func = 0;
+ j[0].callback.cookie = NULL;
+
+ j[1].command = EXECUTE_QPU;
+ j[1].u.q.jobs = qpu1_n;
+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+ j[1].u.q.timeout = 5000;
+ j[1].callback.func = 0;
+ j[1].callback.cookie = NULL;
+
+ j[2].command = EXECUTE_QPU;
+ j[2].u.q.jobs = qpu0_n;
+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[2].u.q.noflush = 1;
+ j[2].u.q.timeout = 5000;
+ j[2].callback.func = 0;
+ j[2].callback.cookie = NULL;
+
+ j[3].command = EXECUTE_SYNC;
+ j[3].u.s.mask = 3;
+ j[3].callback.func = callback;
+ j[3].callback.cookie = (void *)&sync0;
+
+ av_assert0(vc_gpuserv_execute_code(4, j) == 0);
+
+ sem_wait(&sync0);
+#else
+
+ sem_t sync0, sync2;
+ struct gpu_job_s j[3];
+
+ sem_init(&sync0, 0, 0);
+ sem_init(&sync2, 0, 0);
+
+ j[0].command = EXECUTE_VPU;
+ j[0].u.v.q[0] = vpu_code;
+ j[0].u.v.q[1] = r0;
+ j[0].u.v.q[2] = r1;
+ j[0].u.v.q[3] = r2;
+ j[0].u.v.q[4] = r3;
+ j[0].u.v.q[5] = r4;
+ j[0].u.v.q[6] = r5;
+ j[0].callback.func = callback;
+ j[0].callback.cookie = (void *)&sync0;
+
+ j[1].command = EXECUTE_QPU;
+ j[1].u.q.jobs = qpu1_n;
+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+ j[1].u.q.timeout = 5000;
+ j[1].callback.func = 0;
+ j[1].callback.cookie = NULL;
+
+ j[2].command = EXECUTE_QPU;
+ j[2].u.q.jobs = qpu0_n;
+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[2].u.q.noflush = 1;
+ j[2].u.q.timeout = 5000;
+ j[2].callback.func = callback;
+ j[2].callback.cookie = (void *)&sync2;
+
+ av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+
+ sem_wait(&sync0);
+ sem_wait(&sync2);
+#endif
+
+ return 0;
+}
+
+
+// Wait for completion of the given command
+void vpu_wait(int id)
+{
+ if (id == 0) {
+#if 0
+ sem_t sync0;
+ struct gpu_job_s j[1] =
+ {
+ {
+ .command = EXECUTE_SYNC,
+ .u.s.mask = 3,
+ .callback.func = callback,
+ .callback.cookie = (void *)&sync0
+ }
+ };
+
+ sem_init(&sync0, 0, 0);
+
+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+
+ sem_wait(&sync0);
+#endif
+ }
+ else {
+ while ((int32_t)(post_done - (uint32_t)id) < 0) {
+ usleep(1000);
+ }
+ }
+}
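+
+// Illustrative sketch of the post/wait pairing above, assuming a VPU entry
+// point obtained from vpu_get_fn(); the register arguments r0..r5 are
+// placeholders for whatever the VPU routine expects.
+//
+//   int id = vpu_post_code2(vpu_get_fn(), r0, r1, r2, r3, r4, r5, NULL);
+//   // ... overlap other CPU-side work here ...
+//   vpu_wait(id);   // returns once post_done has caught up with this id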
+
+
+unsigned int qpu_get_fn(int num) {
+ // Make sure that the gpu is initialized
+ unsigned int *fn;
+ if (gpu==NULL) {
+ printf("Preparing gpu\n");
+ gpu_lock();
+ gpu_unlock();
+ }
+ switch(num) {
+ case QPU_MC_SETUP:
+ fn = mc_setup;
+ break;
+ case QPU_MC_FILTER:
+ fn = mc_filter;
+ break;
+ case QPU_MC_EXIT:
+ fn = mc_exit;
+ break;
+ case QPU_MC_INTERRUPT_EXIT12:
+ fn = mc_interrupt_exit12;
+ break;
+ case QPU_MC_FILTER_B:
+ fn = mc_filter_b;
+ break;
+ //case QPU_MC_FILTER_HONLY:
+ // fn = mc_filter_honly;
+ // break;
+ case QPU_MC_SETUP_UV:
+ fn = mc_setup_uv;
+ break;
+ case QPU_MC_FILTER_UV:
+ fn = mc_filter_uv;
+ break;
+ case QPU_MC_FILTER_UV_B0:
+ fn = mc_filter_uv_b0;
+ break;
+ case QPU_MC_FILTER_UV_B:
+ fn = mc_filter_uv_b;
+ break;
+ case QPU_MC_INTERRUPT_EXIT8:
+ fn = mc_interrupt_exit8;
+ break;
+ case QPU_MC_END:
+ fn = mc_end;
+ break;
+ default:
+ printf("Unknown function\n");
+ exit(-1);
+ }
+ return gpu->vc + 4*(int)(fn-rpi_shader);
+ //return code[num] + gpu->vc;
+}
+
+#if 0
+typedef unsigned int uint32_t;
+
+typedef struct mvs_s {
+ GPU_MEM_PTR_T unif_mvs_ptr;
+ uint32_t *unif_mvs; // Base of memory for motion vector commands
+
+ // _base pointers are to the start of the row
+ uint32_t *mvs_base[8];
+ // these pointers are to the next free space
+ uint32_t *u_mvs[8];
+
+} HEVCContext;
+
+#define RPI_CHROMA_COMMAND_WORDS 12
+
+static void rpi_inter_clear(HEVCContext *s)
+{
+ int i;
+ for(i=0;i<8;i++) {
+ s->u_mvs[i] = s->mvs_base[i];
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 128; // w
+ *s->u_mvs[i]++ = 128; // h
+ *s->u_mvs[i]++ = 128; // stride u
+ *s->u_mvs[i]++ = 128; // stride v
+ s->u_mvs[i] += 3; // Padding words
+ }
+}
+
+static void rpi_execute_inter_qpu(HEVCContext *s)
+{
+ int k;
+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+
+ for(k=0;k<8;k++) {
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V
+ }
+
+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+
+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+ );
+}
+
+void rpi_test_qpu(void)
+{
+ HEVCContext mvs;
+ HEVCContext *s = &mvs;
+ int i;
+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+ uint32_t *p;
+ printf("Allocate memory\n");
+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+
+ // Set up initial locations for uniform streams
+ p = s->unif_mvs;
+ for(i = 0; i < 8; i++) {
+ s->mvs_base[i] = p;
+ p += uv_commands_per_qpu;
+ }
+ // Now run a simple program that should just quit immediately after a single texture fetch
+ rpi_inter_clear(s);
+ for(i=0;i<4;i++) {
+ printf("Launch QPUs\n");
+ rpi_execute_inter_qpu(s);
+ printf("Done\n");
+ }
+ printf("Free memory\n");
+ gpu_free(&s->unif_mvs_ptr);
+ return;
+}
+#endif
+
+#if 0
+
+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24)
+
+static uint8_t av_clip_uint8(int32_t a)
+{
+ if (a&(~255)) return (-a)>>31;
+ else return a;
+}
+
+static int32_t filter8(const uint8_t *data, int pitch)
+{
+ int32_t vsum = 0;
+ int x, y;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += hcoeffs[x]*data[x + y * pitch];
+
+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+ }
+
+ return av_clip_uint8( (vsum + 64) >> 7);
+}
+
+// Note: the regression test changes the coefficients, so it is not thread safe
+//#define REGRESSION
+#ifdef REGRESSION
+#define CMAX 100
+#else
+#define CMAX 2
+#endif
+#define YMAX 16
+
+int rpi_test_shader(void)
+{
+ int i, c;
+
+ uint32_t *unifs;
+
+ uint8_t *in_buffer;
+ uint8_t *out_buffer[2];
+
+ GPU_MEM_PTR_T unifs_ptr;
+ GPU_MEM_PTR_T in_buffer_ptr;
+ GPU_MEM_PTR_T out_buffer_ptr[2];
+
+ // Addresses in GPU memory of filter programs
+ uint32_t mc_setup = 0;
+ uint32_t mc_filter = 0;
+ uint32_t mc_exit = 0;
+
+ int pitch = 0x500;
+
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+
+ printf("This needs to change to reflect new assembler\n");
+ // Use table to compute locations of program start points
+ mc_setup = code[0] + gpu->vc;
+ mc_filter = code[1] + gpu->vc;
+ mc_exit = code[2] + gpu->vc;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+ return -2;
+ }
+ unifs = (uint32_t*)unifs_ptr.arm;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+ return -3;
+ }
+ in_buffer = (uint8_t*)in_buffer_ptr.arm;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+ return -4;
+ }
+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+
+ for (c = 0; c < CMAX; c++) {
+ int xo[] = {rand()&31, rand()&31};
+
+#ifdef REGRESSION
+ for (i = 0; i < 8; i++) {
+ hcoeffs[i] = (int8_t)rand();
+ vcoeffs[i] = (int8_t)rand();
+ if (hcoeffs[i]==-128)
+ hcoeffs[i]++;
+ if (vcoeffs[i]==-128)
+ vcoeffs[i]++;
+ }
+#endif
+
+ for (i = 0; i < 64*23; i++) {
+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+ in_buffer[i] = rand();
+ }
+
+ // Clear output array
+ {
+ int b;
+ for(b=0;b<2;b++) {
+ for(i=0;i<16*16;i++) {
+ out_buffer[b][i] = 3;
+ }
+ }
+ }
+
+ unifs[0] = mc_filter;
+ unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+ unifs[2] = 64; // src pitch
+ unifs[3] = pitch; // dst pitch
+ unifs[4] = 0; // Padding
+ unifs[5] = 0;
+ unifs[6] = 0;
+ unifs[7 ] = mc_filter;
+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[13] = out_buffer_ptr[0].vc;
+ unifs[14] = mc_exit;
+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy
+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[20] = out_buffer_ptr[1].vc;
+
+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+
+ // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+
+ //qpu_run_shader(mc_setup, unifs_ptr.vc);
+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+
+ if (1)
+ {
+ int x, y, b;
+ int bad = 0;
+
+ for (b=0; b<2; ++b)
+ for (y=0; y<YMAX; ++y)
+ for (x=0; x<16; ++x) {
+ int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+
+ if (out_buffer[b][x+y*pitch] != ref) {
+ bad = 1;
+// printf("%d, %d, %d, %d\n", c, b, x, y);
+ }
+#ifndef REGRESSION
+ //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+#endif
+ }
+ if (bad)
+ printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+ else
+ printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+ }
+ //printf("%d\n", simpenrose_get_qpu_tick_count());
+ }
+
+ gpu_free(&out_buffer_ptr[0]);
+ gpu_free(&out_buffer_ptr[1]);
+ gpu_free(&in_buffer_ptr);
+ gpu_free(&unifs_ptr);
+
+ return 0;
+}
+
+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+{
+ int x,y;
+ for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+ }
+ }
+}
+
+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+{
+ uint32_t *unifs;
+
+ GPU_MEM_PTR_T unifs_ptr;
+ //uint8_t *out_buffer;
+ //GPU_MEM_PTR_T out_buffer_ptr;
+
+ // Addresses in GPU memory of filter programs
+ uint32_t mc_setup = 0;
+ uint32_t mc_filter = 0;
+ uint32_t mc_exit = 0;
+ //int x,y;
+
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+
+ // Use table to compute locations of program start points
+ mc_setup = code[0] + gpu->vc;
+ mc_filter = code[1] + gpu->vc;
+ mc_exit = code[2] + gpu->vc;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+ return;
+ }
+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+ //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+
+ /*for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ out_buffer[x+y*dst_pitch] = 7;
+ }
+ }*/
+
+ unifs = (uint32_t*)unifs_ptr.arm;
+
+ unifs[0] = mc_filter;
+ unifs[1] = (int)in_buffer_vc;
+ unifs[2] = src_pitch; // src pitch
+ unifs[3] = dst_pitch; // dst pitch
+ unifs[4] = 0; // Padding
+ unifs[5] = 0;
+ unifs[6] = 0;
+ unifs[7 ] = mc_exit;
+ unifs[8 ] = (int)in_buffer_vc;
+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[13] = (int)dst_vc;
+ //unifs[13] = (int)out_buffer_ptr.vc;
+
+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+
+ qpu_run_shader(mc_setup, unifs_ptr.vc);
+
+ /*for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+ }
+ }*/
+
+ gpu_free(&unifs_ptr);
+ //gpu_free(&out_buffer_ptr);
+}
+
+
+
+#endif
+
+#endif // RPI
diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
new file mode 100644
index 0000000..c6cdb2b
--- /dev/null
+++ b/libavcodec/rpi_qpu.h
@@ -0,0 +1,176 @@
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+// *** N.B. The fallback path has rotted and crashes if this is left unset (as it stood before this set of changes)
+#define RPI_FAST_CACHEFLUSH
+
+#define RPI_ONE_BUF 1
+
+typedef struct gpu_mem_ptr_s {
+ unsigned char *arm; // Pointer to memory mapped on ARM side
+ int vc_handle; // Videocore handle of relocatable memory
+ int vcsm_handle; // Handle for use by VCSM
+ int vc; // Address for use in GPU code
+ int numbytes; // Size of memory block
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T *p);
+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+
+#include "libavutil/frame.h"
+#if !RPI_ONE_BUF
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+ return p->vc;
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+}
+
+#else
+
+static inline int gpu_is_buf1(const AVFrame * const frame)
+{
+ return frame->buf[1] == NULL;
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+{
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+{
+ return av_buffer_pool_opaque(frame->buf[n]);
+}
+
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ?
+ gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+ gpu_buf3_gmem(frame, 1)->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ?
+ gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+ gpu_buf3_gmem(frame, 2)->vc;
+}
+
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.numbytes = frame->data[1] - frame->data[0];
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 0);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[1] - frame->data[0];
+ g.vc += frame->data[1] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 1);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[2] - frame->data[0];
+ g.vc += frame->data[2] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
+
+#endif
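+
+// Illustrative sketch (not part of this header's API surface): how the
+// accessors above are expected to be used to hand an AVFrame's planes to
+// GPU code. The frame pointer is hypothetical and assumed to come from a
+// GPU-memory frame pool.
+//
+//   uint32_t y_vc = get_vc_address_y(frame);
+//   uint32_t u_vc = get_vc_address_u(frame);
+//   uint32_t v_vc = get_vc_address_v(frame);
+//   // these bus addresses can then be written into QPU uniform/command streams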
+
+
+// QPU specific functions
+extern void rpi_test_qpu(void);
+
+enum {
+ QPU_MC_SETUP,
+ QPU_MC_FILTER,
+ QPU_MC_EXIT,
+ QPU_MC_INTERRUPT_EXIT12,
+ QPU_MC_FILTER_B,
+ QPU_MC_FILTER_HONLY,
+ QPU_MC_SETUP_UV,
+ QPU_MC_FILTER_UV,
+ QPU_MC_FILTER_UV_B0,
+ QPU_MC_FILTER_UV_B,
+ QPU_MC_INTERRUPT_EXIT8,
+ QPU_MC_END
+ };
+extern unsigned int qpu_get_fn(int num);
+
+#define QPU_N_UV 8
+#define QPU_N_Y 12
+#define QPU_N_MAX 16
+
+#define QPU_MAIL_EL_VALS 2
+#define QPU_MAIL_EL_SIZE (QPU_MAIL_EL_VALS * sizeof(uint32_t))
+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
+
+// VPU specific functions
+extern unsigned int vpu_get_fn(void);
+extern unsigned int vpu_get_constants(void);
+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ int qpu0_n, const uint32_t * qpu0_mail,
+ int qpu1_n, const uint32_t * qpu1_mail);
+
+extern void vpu_wait( int id);
+
+// Simple test of shader code
+extern int rpi_test_shader(void);
+
+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+
+extern int gpu_get_mailbox(void);
+
+#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
index 0000000..06fb166
--- /dev/null
+++ b/libavcodec/rpi_shader.c
@@ -0,0 +1,629 @@
+#include "rpi_shader.h"
+
+#ifdef _MSC_VER
+ #include <stdint.h>
+ /* cast through uintptr_t to avoid warnings */
+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+#else
+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
+#endif
+
+#ifdef __cplusplus
+extern "C" { /* the types are probably wrong... */
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+__declspec(align(8))
+#elif defined(__GNUC__)
+__attribute__((aligned(8)))
+#endif
+unsigned int rpi_shader[] = {
+// ::mc_setup_uv
+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+// ::mc_filter_uv
+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28
+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif
+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+// :uvloop
+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_filter_uv_b0
+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21
+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0, r0, i_shift16 ; mov ra3, unif
+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif
+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0
+// :uvloop_b0
+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_filter_uv_b
+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif
+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8
+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif
+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+// :uvloop_b
+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4
+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14
+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_exit
+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop
+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_interrupt_exit8
+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_setup
+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+// :per_block_setup
+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num
+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b
+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b
+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif
+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif
+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13
+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a
+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b
+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c
+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d
+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+// ::mc_filter
+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+// :yloop
+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_filter_b
+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+// :yloopb
+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12
+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14
+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_interrupt_exit12
+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_exit1
+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_end
+};
+#ifdef __HIGHC__
+#pragma Align_to(8, rpi_shader)
+#endif
diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
new file mode 100644
index 0000000..9772796
--- /dev/null
+++ b/libavcodec/rpi_shader.h
@@ -0,0 +1,19 @@
+#ifndef rpi_shader_H
+#define rpi_shader_H
+
+extern unsigned int rpi_shader[];
+
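+/* Note: each entry below is a word offset into the 32-bit rpi_shader[] code
+ * array.  A QPU instruction is two 32-bit words, so a kernel's byte offset in
+ * the assembled dump is 4 * the value here - e.g. mc_filter at word 872
+ * corresponds to byte address 0x00000da0 in rpi_shader.c.
+ */
+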
+#define mc_setup_uv (rpi_shader + 0)
+#define mc_filter_uv (rpi_shader + 132)
+#define mc_filter_uv_b0 (rpi_shader + 274)
+#define mc_filter_uv_b (rpi_shader + 392)
+#define mc_exit (rpi_shader + 540)
+#define mc_interrupt_exit8 (rpi_shader + 558)
+#define mc_setup (rpi_shader + 588)
+#define mc_filter (rpi_shader + 872)
+#define mc_filter_b (rpi_shader + 992)
+#define mc_interrupt_exit12 (rpi_shader + 1114)
+#define mc_exit1 (rpi_shader + 1152)
+#define mc_end (rpi_shader + 1168)
+
+#endif
diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
index 0000000..aa9e1e7
--- /dev/null
+++ b/libavcodec/rpi_shader.qasm
@@ -0,0 +1,1098 @@
+# register allocation
+#
+# ra0...ra7 eight horizontal filter coefficients
+#
+# rb0 rx_shift2
+# rb1 rb_y2_next
+#
+# rb4...rb7
+#
+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent)
+#
+# (ra15 isn't clamped to zero - this happens during the
+# copy to ra14, and during its use in the vertical filter)
+#
+# rb8...rb11 eight vertical filter coefficients
+
+# ra4 Y: Filter, UV: 0x10000
+
+# rb12 offset to add before shift (round + weighting offsets)
+# rb13 shift: denom + 6 + 9
+# rb14 L0 weight (U on left, V on right)
+# rb15 -- free --
+#
+# ra16 clipped(row start address+elem_num)&~3
+# ra17 per-channel shifts
+# ra18 L1 weight (Y)
+# ra19 next ra17
+#
+# rb16 pitch
+# rb17 height + 1
+# rb18 height + 3
+# rb19 next ra16
+#
+# ra20 1
+# ra21 ra_21
+# ra22 ra_k256 256
+# ra23 ra_y2_next ra_y2_next
+#
+# rb20 0xffffff00
+# rb21 vpm_setup for reading/writing 16bit results into VPM
+# rb22 rb_k255 255
+# rb23 24
+#
+# rb24 vdw_setup_1(dst_pitch)
+# rb25 frame width-1
+# rb26 height<<23 + width<<16 + vdw_setup_0
+# rb27 vdw_setup_0 (depends on QPU number)
+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM
+# rb29 vdw_setup_1(dst_pitch-width)
+# rb30 frame height-1
+# rb31 used as temp to count loop iterations
+#
+# ra24 clipped(row start address+8+elem_num)&~3
+# ra25 per-channel shifts 2
+# ra26 next ra24
+# ra27 next ra25
+# ra28 next y
+# ra29 y for next texture access
+# ra30 64
+#
+# ra31 next kernel address
+
+.set rb_frame_width_minus_1, rb25
+.set rb_frame_height_minus_1, rb30
+.set rb_pitch, rb16
+.set ra_x, ra16
+.set ra_y2, ra21.16a
+.set ra_y2_next, ra21.16b
+
+.set rb_x_next, rb19
+.set rx_frame_base2_next, rb19
+
+.set ra_frame_base, ra24
+.set ra_frame_base_next, ra26
+.set ra_xshift, ra17
+
+.set ra_u2v_ref_offset, ra25
+.set ra_frame_base2, ra25
+
+.set ra_xshift_next, ra19
+.set rx_xshift2, rb0
+.set rx_xshift2_next, rb1
+
+.set ra_u2v_dst_offset, ra27
+
+.set ra_y_next, ra28
+.set ra_y, ra29
+
+.set ra_k1, ra20
+.set rb_k255, rb22
+.set ra_k256, ra22
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
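+# e.g. -16 & 31 == 16 and -11 & 31 == 21, giving left shifts of 16 and 21
+# (which presumably would not fit in the small-immediate range directly)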
+
+################################################################################
+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+::mc_setup_uv
+
+# Read starting kernel
+mov ra31, unif
+
+# Load first request location
+add ra_x, unif, elem_num # Store x
+mov ra_y, unif # Store y
+mov ra_frame_base, unif # Store frame u base
+nop
+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+
+# Read image dimensions
+sub rb25,unif,1
+sub rb30,unif,1
+
+# get source pitch
+mov rb16, unif
+
+# get destination pitch
+mov r0, unif
+mov r1, vdw_setup_1(0)
+add rb24, r1, r0
+
+# load constants
+
+mov ra4, 0x10000
+mov ra_k1, 1
+mov ra_k256, 256
+mov ra30, 64
+
+mov rb20, 0xffffff00
+mov rb_k255, 255
+mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
+mov ra8, 0
+mov ra9, 0
+mov ra10, 0
+mov ra11, 0
+mov ra12, 0
+mov ra13, 0
+mov ra14, 0
+mov ra15, 0
+
+# Compute base address for first and second access
+mov r0, ra_x # Load x
+max r0, r0, 0; mov r1, ra_y # Load y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base
+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+add ra_y, r1, 1
+add r0, r0, r3
+and r0, r0, ~3
+max r1, r1, 0 ; mov ra_x, r0 # y
+min r1, r1, rb_frame_height_minus_1
+# submit texture requests for first line
+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+add t0s, r0, r1 ; mov ra_frame_base, r2
+add t1s, r2, r1
+
+mov r2, 9
+add rb13, r2, unif # denominator
+mov -, unif # Unused
+
+# Compute part of VPM to use for DMA output
+mov r2, unif
+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+and r2, r2, 15
+mov r1, r2
+asr r1, r1, 2
+shl r1, r1, 6
+mov r0, r2
+and r0, r0, 3
+add r0, r0, r1
+
+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+add rb28, r0, r1 # VPM 8bit storage
+asr r2, r0, 1 # r0 = bc0000d
+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+add rb21, r2, r1 # VPM for 16bit intermediates
+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+shl r0, r0, 5
+add rb27, r0, r1 # DMA out
+
+# submit texture requests for second line
+max r1, ra_y, 0
+min r1, r1, rb_frame_height_minus_1
+add ra_y, ra_y, 1
+bra -, ra31
+nop ; mul24 r1, r1, rb_pitch
+add t0s, r1, ra_x
+add t1s, r1, ra_frame_base
+
+
+
+################################################################################
+
+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov r1, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+mov ra_y_next, r1 ; mov vw_setup, rb28
+add ra_frame_base_next, rb_x_next, r2
+
+# set up VPM write
+# get width,height of block
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# unpack filter coefficients
+
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight
+nop ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop count
+
+shl r1, ra1.16b, rb13
+asr rb12, r1, 1
+shl rb14, ra1.16a, 1 # b14 = weight*2
+
+# rb14 - weight L0 * 2
+# rb13 = weight denom + 6 + 9
+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+
+# r2 is elem_num
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop
+mov ra13, ra14 ; mul24 r1, ra14, rb9
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+sub r1, r1, r0 ; mov -, vw_wait
+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+asr r1, r1, 14
+nop ; mul24 r1, r1, rb14
+shl r1, r1, 8
+
+add r1, r1, rb12
+brr.anyn -, r:uvloop
+asr r1, r1, rb13
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
+
+# DMA out for U
+
+mov vw_setup, rb26 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+# DMA out for V
+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+# Could potentially push this write into the start of the next pipeline stage.
+mov r0, 16
+mov -, vw_wait
+
+bra -, ra31
+add vw_setup, rb26, r0 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+
+################################################################################
+
+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv_b0
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov r1, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs
+mov ra_y_next, r1 ; mov vw_setup, rb21
+
+add ra_frame_base_next, rb_x_next, r2
+
+# Need to have unsigned coeffs so we can just unpack in the filter
+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+# filter code. Unpack into b regs for V
+
+# set up VPM write, we need to save 16bit precision
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs
+add rb26, r0, rb27
+
+mov rb8, ra3.8a
+mov rb9, ra3.8b
+mov rb10, ra3.8c
+mov rb11, ra3.8d
+
+# r2 is elem_num
+# r3 is loop counter
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+mov rb14, unif # U weight L0
+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter
+# rb14 unused in b0 but will hang around till the second pass
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop_b0
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b0
+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b0
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+sub.setf -, r3, rb18
+brr.anyn -, r:uvloop_b0
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+sub r1, r1, r0 ; mov -, vw_wait
+asr vpm, r1, 6
+# >>> .anyn uvloop_b0
+
+# in pass0 we don't really need to save any results, but need to discard the uniforms
+# DMA out for U
+
+bra -, ra31
+mov -, unif # Delay 1
+mov -, unif # Delay 2
+nop # Delay 3
+
+
+################################################################################
+
+::mc_filter_uv_b
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# set up VPM write
+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov ra_y_next, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base
+add r0, r0, r3 ; mov ra1, unif # width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+
+add ra_frame_base_next, rb_x_next, r2
+
+# r0 is currently height<<7
+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+shr r3, r3, 8
+add vr_setup, r3, rb21
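+# (the shl by 21 followed by the logical shr by 8 is a net shl by 13 that also
+# clears the top 8 bits, turning height<<7 into height<<20 for vr_setup)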
+
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27
+
+# get filter coefficients
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# Get offset & weight stuff
+
+# The unif read occurs unconditionally, only the write is conditional
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ;
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ;
+nop ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ;
+
+shl r1, ra1.16b, rb13
+asr rb12, r1, 1
+
+# ra1.16a used directly in the loop
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop_b
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b
+mov ra13, ra14 ; mul24 r1, ra14, rb9
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000
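+# (the multiply by 0x10000 moves the unsigned 16-bit VPM value into the top
+# half-word; the asr by 16 a few instructions below recovers it sign extended)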
+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+asr r1, r1, 14 # shift2=6
+
+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
+nop ; mul24 r0, r0, rb14
+
+add r1, r1, r0 ; mov -, vw_wait
+shl r1, r1, 8 # Lose bad top 8 bits & sign extend
+
+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+
+brr.anyn -, r:uvloop_b
+asr r1, r1, rb13 # Delay 1
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
+
+
+# DMA out for U
+
+mov vw_setup, rb26 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+# DMA out for V
+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+# Could potentially push this write into the start of the next pipeline stage.
+mov r0, 16
+mov -, vw_wait
+
+bra -, ra31
+add vw_setup, rb26, r0 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+################################################################################
+
+# mc_exit()
+
+::mc_exit
+mov -, vw_wait # wait on the VDW
+
+mov -,srel(0)
+
+ldtmu0
+ldtmu1
+ldtmu0
+ldtmu1
+
+nop ; nop ; thrend
+nop ; nop # delay slot 1
+nop ; nop # delay slot 2
+
+# mc_interrupt_exit8()
+::mc_interrupt_exit8
+mov -, vw_wait # wait on the VDW
+
+ldtmu0
+ldtmu1
+ldtmu0
+ldtmu1
+
+mov -,sacq(0) # 1
+mov -,sacq(0) # 2
+mov -,sacq(0) # 3
+mov -,sacq(0) # 4
+mov -,sacq(0) # 5
+mov -,sacq(0) # 6
+mov -,sacq(0) # 7
+
+nop ; nop ; thrend
+mov interrupt, 1; nop # delay slot 1
+nop ; nop # delay slot 2
+
+
+
+
+
+# LUMA CODE
+
+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8
+
+################################################################################
+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+::mc_setup
+ mov r3, 16
+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
+ mov ra8, unif # y_x
+ mov ra9, unif # ref_y_base
+ mov ra10, unif # y2_x2
+ mov ra11, unif # ref_y2_base
+
+# Read image dimensions
+ mov r1, unif # width_height
+ shl r0,r1,r3
+ asr r1,r1,r3 # width
+ asr r0,r0,r3 # height
+ sub rb_frame_width_minus_1,r1,1
+ sub rb_frame_height_minus_1,r0,1
+
+# get source pitch
+ mov rb_pitch, unif # src_pitch
+
+# get destination pitch
+ mov r0, unif # dst_pitch
+ mov r1, vdw_setup_1(0)
+ add rb24, r1, r0
+
+# Compute base address for first and second access
+ mov r1, ra8 # y_x
+ shl r0,r1,r3 # r0 is x<<16
+ asr r1,r1,r3 # r1 is y
+ asr r0,r0,r3 # r0 is x
+ add r0, r0, elem_num # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ add ra_y, r1, 1
+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
+ add r2, r2, r0 # r2 is address for frame0 (not including y offset)
+ max r1, r1, 0
+ min r1, r1, rb_frame_height_minus_1
+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0
+ add t0s, r2, r1 ; mov ra_frame_base, r2
+
+ mov r1, ra10 # y2_x2
+ shl r0,r1,r3 # r0 is x<<16
+ asr r1,r1,r3 # r1 is y
+ asr r0,r0,r3 # r0 is x
+ add r0, r0, elem_num # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
+ add ra_y2, r1, 1
+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
+ add r2, r2, r0 # r2 is address for frame1 (not including y offset)
+ max r1, r1, 0
+ min r1, r1, rb_frame_height_minus_1
+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame1
+ add t1s, r2, r1 ; mov ra_frame_base2, r2
+
+
+# load constants
+
+ mov ra_k1, 1
+ mov ra_k256, 256
+ mov ra30, 64
+
+ mov rb20, 0xffffff00
+ mov rb_k255, 255
+ mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
+ mov ra8, 0
+ mov ra9, 0
+ mov ra10, 0
+ mov ra11, 0
+ mov ra12, 0
+ mov ra13, 0
+ mov ra14, 0
+ mov ra15, 0
+
+# Compute part of VPM to use
+ mov r2, qpu_num
+ mov r1, r2
+ asr r1, r1, 2
+ shl r1, r1, 6
+ mov r0, r2
+ and r0, r0, 3
+ add r0, r0, r1
+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+ add rb28, r0, r1 # VPM for saving data
+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+ shl r0, r0, 5
+ add rb27, r0, r1 # Command for dma output
+
+# Weighted prediction denom
+ add rb13, unif, 9 # unif = weight denom + 6
+
+ mov -, unif # Unused
+
+# submit texture requests for second line
+ max r1, ra_y, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y, ra_y, 1
+ nop ; mul24 r1, r1, rb_pitch
+ add t0s, r1, ra_frame_base
+
+ max r1, ra_y2, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1
+ nop ; mul24 r1, r1, rb_pitch
+ add t1s, r1, ra_frame_base2
+
+# FALL THROUGH TO PER-BLOCK SETUP
+
+# Start of per-block setup code
+# P and B blocks share the same setup code to save on Icache space
+:per_block_setup
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ mov ra31, unif
+
+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack??
+
+# per-channel shifts were calculated on the *previous* invocation
+ mov ra_xshift, ra_xshift_next
+ mov rx_xshift2, rx_xshift2_next
+
+# get base addresses and per-channel shifts for *next* invocation
+
+ add r0, ra1.16a, r1 # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ mov r3, 8 ; mov ra_y_next, ra1.16b
+ and r0, r0, ~3 ; mov ra1, unif # y2_x2
+ add ra_frame_base_next, r2, r0
+
+ add r0, ra1.16a, r1 # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ;
+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate
+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset)
+
+# set up VPM write
+ mov vw_setup, rb28
+
+# get width,height of block (unif load above)
+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+ add rb17, ra1.16a, 5
+ add rb18, ra1.16a, 7
+ shl r0, ra1.16a, 7
+ add r0, r0, ra1.16b # Combine width and height of destination area
+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
+ shl.ifz r0, r0, i_shift16 # Pick half to use
+ shl ra8, r0, 3
+
+# Pack the 1st 4 filter coefs for H & V tightly
+
+ mov r1,0x00010100 # -ve
+ ror ra2.8a, r1, ra8.8d
+ ror ra0.8a, r1, ra8.8c
+
+ mov r1,0x01040400
+ ror ra2.8b, r1, ra8.8d
+ ror ra0.8b, r1, ra8.8c
+
+ mov r1,0x050b0a00 # -ve
+ ror ra2.8c, r1, ra8.8d
+ ror ra0.8c, r1, ra8.8c
+
+ mov r1,0x11283a40
+ ror ra2.8d, r1, ra8.8d
+ ror ra0.8d, r1, ra8.8c
+
+# In the 2nd vertical half we use b registers due to
+# using a-side fifo regs. The easiest way to achieve this to pack it
+# and then unpack!
+
+ mov r1,0x3a281100
+ ror ra3.8a, r1, ra8.8d
+ ror ra1.8a, r1, ra8.8c
+
+ mov r1,0x0a0b0500 # -ve
+ ror ra3.8b, r1, ra8.8d
+ ror ra1.8b, r1, ra8.8c
+
+ mov r1,0x04040100
+ ror ra3.8c, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c
+
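+# (Note: each 32-bit constant above appears to hold one tap's magnitude for
+# the four fractional positions, one per byte - e.g. 0x11283a40 is 17, 40,
+# 58, 64.  ra8 contains the fractional offsets multiplied by 8, so the ror
+# by ra8.8c/ra8.8d rotates the byte for the wanted fraction into the lane
+# being written; signs are applied by the add/sub pattern in the filter
+# loops, hence the "-ve" markers.)
+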
+# Extract weighted prediction information in parallel
+
+ mov r1,0x01010000 # -ve
+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 (hi16) / weight L0 (lo16)
+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9
+
+# r3 = 16 from (long way) above
+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offset = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+ asr ra18, r0, r3 ; mov rb5, ra3.8b
+ bra -, ra31
+ shl r0, r0, r3 ; mov rb6, ra3.8c
+ mov r3, 0 ; mov rb7, ra3.8d # loop count ;
+ asr rb12, r1, 9
+
+# >>> branch ra31
+#
+# r3 = 0
+# ra18 = weight L1
+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble)
+# rb13 = weight denom + 6 + 9
+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+
+
+################################################################################
+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, y2_x2 should be y_x+8
+# At this point we have already issued two pairs of texture requests for the current block
+
+::mc_filter
+# r0 = weight << 16; We want weight * 2 in rb14
+ asr rb14, r0, 15
+
+# r3 = 0
+
+:yloop
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values?
+
+# N.B. Whilst y == y2 as far as this loop is concerned, we will start
+# the grab for the next block before we finish with this block, and that
+# might be a B block where y != y2, so we must do full processing on both y and y2
+
+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloop
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+ # >>> .anyn yloop
+
+ # apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov -, vw_wait
+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+# The top 8 bits have rubbish in them as mul24 is unsigned
+# The low 6 bits need discard before weighting
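+# (mul24 only uses the low 24 bits of r1, so the x256 below is a <<8 of the
+# valid bits, placing their sign bit at bit 31; the asr by 14 then yields a
+# sign-correct overall >>6)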
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, rb14
+ add r1, r1, rb12
+
+ shl r1, r1, 8
+ brr.anyn -, r:yloop
+ asr r1, r1, rb13
+# We have a saturating pack unit - I can't help feeling it should be useful here
+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255
+ max vpm, r1, 0 # Delay 3
+# >>> branch.anyn yloop
+
+# DMA out
+
+ brr -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
+ mov vw_addr, unif # start the VDW Delay 3
+
+
+
+################################################################################
+
+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+# Can fill in the coefficients so only
+# Can also assume default weighted prediction for B frames.
+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+# Or possibly by taking advantage of symmetry?
+# From 19->7 32bits per command.
+
+::mc_filter_b
+ # r0 = weightL0 << 16, we want it in rb14
+ asr rb14, r0, i_shift16
+
+:yloopb
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values?
+
+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloopb
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+ # >>> .anyn yloopb
+
+ # apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb12
+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+# Top 8 bits are bad - low 6 bits should be discarded
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+
+ asr r1, r1, 14
+ nop ; mul24 r0, r1, rb14
+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
+
+ add r1, r1, r0 ; mov -, vw_wait
+ shl r1, r1, 8
+
+ brr.anyn -, r:yloopb
+ asr r1, r1, rb13 # Delay 1
+ min r1, r1, rb_k255 # Delay 2
+ max vpm, r1, 0 # Delay 3
+
+# DMA out
+ brr -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
+ mov vw_addr, unif # start the VDW Delay 3
+
+################################################################################
+
+# mc_interrupt_exit12()
+::mc_interrupt_exit12
+ mov -, vw_wait # wait on the VDW
+
+ # Dummy wait to test instructions
+# mov r3,1000000
+#:dummy_loop
+# sub.setf r3, r3, 1
+# nop
+# nop
+# brr.anynn -, r:dummy_loop
+# nop
+# nop
+# nop
+
+ ldtmu0
+ ldtmu0
+ ldtmu1
+ ldtmu1
+
+ mov -,sacq(0) # 1
+ mov -,sacq(0) # 2
+ mov -,sacq(0) # 3
+ mov -,sacq(0) # 4
+ mov -,sacq(0) # 5
+ mov -,sacq(0) # 6
+ mov -,sacq(0) # 7
+ mov -,sacq(0) # 8
+ mov -,sacq(0) # 9
+ mov -,sacq(0) # 10
+ mov -,sacq(0) # 11
+
+ nop ; nop ; thrend
+ mov interrupt, 1; nop # delay slot 1
+ nop ; nop # delay slot 2
+
+
+::mc_exit1
+ mov -, vw_wait # wait on the VDW
+
+ ldtmu0
+ ldtmu1
+ ldtmu0
+ ldtmu1
+ nop ; nop ; thrend
+ mov interrupt, 1; nop # delay slot 1
+ nop ; nop # delay slot 2
+
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
new file mode 100644
index 0000000..db41a4d
--- /dev/null
+++ b/libavcodec/rpi_user_vcsm.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved.
+*
+* This program is the proprietary software of Broadcom Corporation and/or
+* its licensors, and may only be used, duplicated, modified or distributed
+* pursuant to the terms and conditions of a separate, written license
+* agreement executed between you and Broadcom (an "Authorized License").
+* Except as set forth in an Authorized License, Broadcom grants no license
+* (express or implied), right to use, or waiver of any kind with respect to
+* the Software, and Broadcom expressly reserves all rights in and to the
+* Software and all intellectual property rights therein. IF YOU HAVE NO
+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+* THE SOFTWARE.
+*
+* Except as expressly set forth in the Authorized License,
+* 1. This program, including its structure, sequence and organization,
+* constitutes the valuable trade secrets of Broadcom, and you shall use
+* all reasonable efforts to protect the confidentiality thereof, and to
+* use this information only in connection with your use of Broadcom
+* integrated circuit products.
+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+* RESPECT TO THE SOFTWARE. BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+*****************************************************************************/
+
+#ifndef __USER_VCSM__H__INCLUDED__
+#define __USER_VCSM__H__INCLUDED__
+
+/* VideoCore Shared Memory - user interface library.
+**
+** This library provides all the necessary abstraction for any application to
+** make use of the shared memory service which is distributed across a kernel
+** driver and a videocore service.
+**
+** It is an application design decision to choose or not to use this service.
+**
+** The logical flow of operations that a user application needs to follow when
+** using this service is:
+**
+** 1) Initialize the service.
+** 2) Allocate shared memory blocks.
+** 3) Start using the allocated blocks.
+** - In order to gain ownership on a block, lock the allocated block,
+** locking a block returns a valid address that the user application
+** can access.
+** - When finished with using the block for the current execution cycle
+** or function, and so when giving up the ownership, unlock the block.
+** 4) A block can be locked/unlocked as many times as required - within or outside
+** of - a specific execution context.
+** 5) To completely release an allocated block, free it.
+** 6) If the service is no longer required, terminate it.
+**
+**
+** Some generic considerations:
+
+** Allocating memory blocks.
+**
+** Memory blocks can be allocated in different manners depending on the cache
+** behavior desired. A given block can either be:
+
+** - Allocated in a non cached fashion all the way through host and videocore.
+** - Allocated in a cached fashion on host OR videocore.
+** - Allocated in a cached fashion on host AND videocore.
+**
+** It is an application decision to determine how to allocate a block. Evidently
+** if the application will be doing substantial read/write accesses to a given block,
+** it is recommended to allocate the block at least in a 'host cached' fashion for
+** better results.
+**
+**
+** Locking memory blocks.
+**
+** When the memory block has been allocated in a host cached fashion, locking the
+** memory block (and so taking ownership of it) will trigger a cache invalidation.
+**
+** For the above reason and when using host cached allocation, it is important that
+** an application properly implements the lock/unlock mechanism to ensure the cache
+** stays coherent; otherwise there is no guarantee that it will.
+**
+** It is possible to dynamically change the host cache behavior (ie cached or non
+** cached) of a given allocation without needing to free and re-allocate the block.
+** This feature can be useful for an application which requires access to the block
+** only at certain times and not otherwise. By changing the cache behavior dynamically,
+** the application can optimize performance for a given duration of use.
+** Such dynamic cache behavior remapping only applies to host cache and not videocore
+** cache. If one requires to change the videocore cache behavior, then a new block
+** must be created to replace the old one.
+**
+** On successful locking, a valid pointer is returned that the application can use
+** to access to data inside the block. There is no guarantee that the pointer will
+** stay valid following the unlock action corresponding to this lock.
+**
+**
+** Unlocking memory blocks.
+**
+** When the memory block has been allocated in a host cached fashion, unlocking the
+** memory block (and so giving up its ownership) will trigger a cache flush unless
+** explicitly asked not to flush the cache for performance reasons.
+**
+** For the above reason and when using host cached allocation, it is important that
+** an application properly implements the lock/unlock mechanism to ensure the cache
+** stays coherent; otherwise there is no guarantee that it will.
+**
+**
+** A complete API is defined below.
+*/
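+
+/* Illustrative usage sketch (not part of the original interface comment),
+** following the flow described above: initialise, allocate, lock to obtain a
+** CPU pointer, use the memory, unlock, free, then terminate.  vcsm_unlock_ptr
+** is assumed here as the vcsm_unlock_xx variant (declared further down in
+** this header); memset needs <string.h>.
+**
+**   if (vcsm_init() == 0) {
+**       unsigned int hdl = vcsm_malloc_cache(64 * 1024, VCSM_CACHE_TYPE_HOST, "example");
+**       if (hdl) {
+**           void *p = vcsm_lock(hdl);       // take ownership, get a usable pointer
+**           if (p) {
+**               memset(p, 0, 64 * 1024);    // use the memory
+**               vcsm_unlock_ptr(p);         // give up ownership (flushes host cache)
+**           }
+**           vcsm_free(hdl);                 // release the block
+**       }
+**       vcsm_exit();
+**   }
+*/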
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Different status that can be dumped.
+*/
+typedef enum
+{
+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore.
+ // Result of the walk is seen in the videocore
+ // log.
+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host
+ // driver (ie for all processes). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host
+ // driver (for current process). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+ // driver (for current process). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+ // VCSM_STATUS_HOST_WALK_MAP.
+ //
+ VCSM_STATUS_NONE, // Must be last - invalid.
+
+} VCSM_STATUS_T;
+
+/* Different kind of cache behavior.
+*/
+typedef enum
+{
+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies.
+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space).
+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore.
+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore.
+
+} VCSM_CACHE_TYPE_T;
+
+/* Initialize the vcsm processing.
+**
+** Must be called once before attempting to do anything else.
+**
+** Returns 0 on success, -1 on error.
+*/
+int vcsm_init( void );
+
+
+/* Terminates the vcsm processing.
+**
+** Must be called when vcsm services are no longer needed; it will
+** take care of removing any allocation under the current process
+** control if deemed necessary.
+*/
+void vcsm_exit( void );
+
+
+/* Queries the status of the vcsm.
+**
+** Triggers dump of various kind of information, see the
+** different variants specified in VCSM_STATUS_T.
+**
+** Pid is optional.
+*/
+void vcsm_status( VCSM_STATUS_T status, int pid );
+
+
+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+** allocator.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on which one to use).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc( unsigned int size, char *name );
+
+
+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+** allocator; the type of caching requested is passed as an argument to the
+** function call.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on which one to use).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+
+
+/* Shares an allocated block of memory via the vcsm memory allocator.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on which one to use).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc_share( unsigned int handle );
+
+
+/* Resizes a block of memory allocated previously by vcsm_malloc.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** The handle must be unlocked by user prior to attempting any
+** resize action.
+**
+** On error, the original size allocated against the handle
+** remains available the same way it would be following a
+** successful vcsm_malloc.
+*/
+int vcsm_resize( unsigned int handle, unsigned int new_size );
+
+
+/* Frees a block of memory that was successfully allocated by
+** a prior call to vcsm_malloc.
+**
+** The handle should be considered invalid upon return from this
+** call.
+**
+** Whether any memory is actually freed up or not as the result of
+** this call will depend on many factors; if all goes well it will
+** be freed. If something goes wrong, the memory will likely end up
+** being freed up as part of the vcsm_exit process. In the end the
+** memory is guaranteed to be freed one way or another.
+*/
+void vcsm_free( unsigned int handle );
+
+
+/* Retrieves a videocore opaque handle from a mapped user address
+** pointer. The videocore handle will correspond to the actual
+** memory mapped in videocore.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** Note: the videocore opaque handle is distinct from the user
+** opaque handle (allocated via vcsm_malloc) and is only
+** significant to applications which know what to do with it;
+** for others it is just a number of little use, since nothing
+** can be done with it (in particular, for safety reasons it
+** cannot be used to map anything).
+*/
+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+
+
+/* Retrieves a videocore opaque handle from a user opaque
+** handle. The videocore handle will correspond to the actual
+** memory mapped in videocore.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** Note: the videocore opaque handle is distinct from the user
+** opaque handle (allocated via vcsm_malloc) and is only
+** significant to applications which know what to do with it;
+** for others it is just a number of little use, since nothing
+** can be done with it (in particular, for safety reasons it
+** cannot be used to map anything).
+*/
+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+
+
+/* Retrieves a user opaque handle from a mapped user address
+** pointer.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+*/
+unsigned int vcsm_usr_handle( void *usr_ptr );
+
+
+/* Retrieves a mapped user address from an opaque user
+** handle.
+**
+** Returns: 0 on error
+** a non-zero address on success.
+**
+** On success, the address corresponds to the pointer
+** which can access the data allocated via the vcsm_malloc
+** call.
+*/
+void *vcsm_usr_address( unsigned int handle );
+
+
+/* Locks the memory associated with this opaque handle.
+**
+** Returns: NULL on error
+** a valid pointer on success.
+**
+** A user MUST lock the handle received from vcsm_malloc
+** in order to be able to use the memory associated with it.
+**
+** On success, the pointer returned is only valid within
+** the lock context (i.e. until a corresponding vcsm_unlock_xx
+** is invoked).
+*/
+void *vcsm_lock( unsigned int handle );
+
+
+/* Locks the memory associated with this opaque handle. The lock
+** also gives a chance to update the *host* cache behavior of the
+** allocated buffer if so desired. The *videocore* cache behavior
+** of the allocated buffer cannot be changed by this call and such
+** attempt will be ignored.
+**
+** The system will attempt to honour the cache_update mode request;
+** the cache_result mode will provide the final answer on which cache
+** mode is really in use. Failing to change the cache mode will not
+** result in a failure to lock the buffer, as it is an application
+** decision what to do if (cache_result != cache_update).
+**
+** The value returned in cache_result can only be considered valid if
+** the returned pointer is non NULL. The cache_result pointer may be
+** NULL if the application does not care about the actual outcome of
+** its action with regards to the cache behavior change.
+**
+** Returns: NULL on error
+** a valid pointer on success.
+**
+** A user MUST lock the handle received from vcsm_malloc
+** in order to be able to use the memory associated with it.
+**
+** On success, the pointer returned is only valid within
+** the lock context (i.e. until a corresponding vcsm_unlock_xx
+** is invoked).
+*/
+void *vcsm_lock_cache( unsigned int handle,
+ VCSM_CACHE_TYPE_T cache_update,
+ VCSM_CACHE_TYPE_T *cache_result );
+
+
+/* Unlocks the memory associated with this user mapped address.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking a mapped address, the user should no longer
+** attempt to reference it.
+*/
+int vcsm_unlock_ptr( void *usr_ptr );
+
+
+/* Unlocks the memory associated with this user mapped address.
+** Apply special processing that would override the otherwise
+** default behavior.
+**
+** If 'cache_no_flush' is specified:
+** Do not flush cache as the result of the unlock (if cache
+** flush was otherwise applicable in this case).
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking a mapped address, the user should no longer
+** attempt to reference it.
+*/
+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+
+
+/* Unlocks the memory associated with this user opaque handle.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking an opaque handle, the user should no longer
+** attempt to reference the mapped address once associated
+** with it.
+*/
+int vcsm_unlock_hdl( unsigned int handle );
+
+
+/* Unlocks the memory associated with this user opaque handle.
+** Apply special processing that would override the otherwise
+** default behavior.
+**
+** If 'cache_no_flush' is specified:
+** Do not flush cache as the result of the unlock (if cache
+** flush was otherwise applicable in this case).
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** After unlocking an opaque handle, the user should no longer
+** attempt to reference the mapped address once associated
+** with it.
+*/
+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+
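+/* Illustrative usage sketch (not part of the API): a minimal
+** allocate / lock / use / unlock / free sequence using only the
+** functions declared in this header. Error handling and service
+** start-up/shut-down are omitted.
+**
+**   unsigned int h = vcsm_malloc(4096, "example");
+**   if (h != 0) {
+**       void *p = vcsm_lock(h);
+**       if (p != NULL) {
+**           memset(p, 0, 4096);       // only touch the memory while locked
+**           vcsm_unlock_hdl(h);       // or vcsm_unlock_ptr(p)
+**       }
+**       vcsm_free(h);
+**   }
+*/
+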
+/* Clean and/or invalidate the memory associated with this user opaque handle
+**
+** Returns: non-zero on error
+**
+** The structure contains a list of flush/invalidate commands. Commands are:
+** 0: nop
+** 1: invalidate given virtual range in L1/L2
+** 2: clean given virtual range in L1/L2
+** 3: clean+invalidate given virtual range in L1/L2
+** 4: flush all L1/L2
+*/
+struct vcsm_user_clean_invalid_s {
+ struct {
+ unsigned int cmd;
+ unsigned int handle;
+ unsigned int addr;
+ unsigned int size;
+ } s[8];
+};
+
+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __USER_VCSM__H__INCLUDED__ */
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
index 0000000..9580165
--- /dev/null
+++ b/libavcodec/rpi_zc.c
@@ -0,0 +1,406 @@
+#include "config.h"
+#ifdef RPI
+#include "rpi_qpu.h"
+#include "rpi_zc.h"
+
+#include "libavutil/buffer_internal.h"
+
+struct ZcPoolEnt;
+
+typedef struct ZcPool
+{
+ int numbytes;
+ struct ZcPoolEnt * head;
+ pthread_mutex_t lock;
+} ZcPool;
+
+typedef struct ZcPoolEnt
+{
+ // It is important that we start with gmem as other bits of code will expect to see that
+ GPU_MEM_PTR_T gmem;
+ struct ZcPoolEnt * next;
+ struct ZcPool * pool;
+} ZcPoolEnt;
+
+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
+{
+ ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ goto fail0;
+ }
+
+ if (gpu_malloc_cached(size, &zp->gmem) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
+ goto fail1;
+ }
+
+ zp->next = NULL;
+ zp->pool = pool;
+ return zp;
+
+fail1:
+ av_free(zp);
+fail0:
+ return NULL;
+}
+
+static void zc_pool_ent_free(ZcPoolEnt * const zp)
+{
+ gpu_free(&zp->gmem);
+ av_free(zp);
+}
+
+static void zc_pool_flush(ZcPool * const pool)
+{
+ ZcPoolEnt * p = pool->head;
+ pool->head = NULL;
+ while (p != NULL)
+ {
+ ZcPoolEnt * const zp = p;
+ p = p->next;
+ zc_pool_ent_free(zp);
+ }
+}
+
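+// The pool only ever holds buffers of a single size: if the requested size
+// differs from the size currently held, the free list is flushed and the
+// pool switches to the new size. This suits the use here, where successive
+// frames of a stream share one allocation size.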
+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes)
+{
+ ZcPoolEnt * zp;
+ pthread_mutex_lock(&pool->lock);
+
+ if (numbytes != pool->numbytes)
+ {
+ zc_pool_flush(pool);
+ pool->numbytes = numbytes;
+ }
+
+ if (pool->head != NULL)
+ {
+ zp = pool->head;
+ pool->head = zp->next;
+ }
+ else
+ {
+ zp = zc_pool_ent_alloc(pool, numbytes);
+ }
+
+ pthread_mutex_unlock(&pool->lock);
+ return zp;
+}
+
+static void zc_pool_free(ZcPoolEnt * const zp)
+{
+ ZcPool * const pool = zp == NULL ? NULL : zp->pool;
+ if (zp != NULL)
+ {
+ pthread_mutex_lock(&pool->lock);
+ if (pool->numbytes == zp->gmem.numbytes)
+ {
+ zp->next = pool->head;
+ pool->head = zp;
+ pthread_mutex_unlock(&pool->lock);
+ }
+ else
+ {
+ pthread_mutex_unlock(&pool->lock);
+ zc_pool_ent_free(zp);
+ }
+ }
+}
+
+static void
+zc_pool_init(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ pool->head = NULL;
+ pthread_mutex_init(&pool->lock, NULL);
+}
+
+static void
+zc_pool_destroy(ZcPool * const pool)
+{
+ pool->numbytes = -1;
+ zc_pool_flush(pool);
+ pthread_mutex_destroy(&pool->lock);
+}
+
+
+typedef struct AVZcEnv
+{
+ ZcPool pool;
+} ZcEnv;
+
+// Callback when buffer unrefed to zero
+static void rpi_free_display_buffer(void *opaque, uint8_t *data)
+{
+ ZcPoolEnt *const zp = opaque;
+// printf("%s: data=%p\n", __func__, data);
+ zc_pool_free(zp);
+}
+
+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
+{
+    // Kludge: we check the free fn to verify this is really
+    // one of our buffers - can't think of a better way
+ return buf == NULL || buf->buffer->free != rpi_free_display_buffer ? NULL :
+ av_buffer_get_opaque(buf);
+}
+
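+// Both stride and height are padded by 32 and rounded up to a multiple of 32.
+// Worked example (illustrative): a 1920x1080 frame gives stride_y = 1952,
+// height_y = 1120, stride_c = 976, height_c = 560, so one contiguous picture
+// needs 1952*1120 + 2*(976*560) = 3279360 bytes (see rpi_get_display_buffer).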
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const unsigned int video_width, const unsigned int video_height)
+{
+ AVRpiZcFrameGeometry geo;
+ geo.stride_y = (video_width + 32 + 31) & ~31;
+ geo.stride_c = geo.stride_y / 2;
+// geo.height_y = (video_height + 15) & ~15;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ return geo;
+}
+
+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+{
+ ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ AVBufferRef * buf;
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ goto fail0;
+ }
+
+ if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ goto fail2;
+ }
+
+ return buf;
+
+fail2:
+ zc_pool_free(zp);
+fail0:
+ return NULL;
+}
+
+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame)
+{
+ ZcEnv *const zc = s->get_buffer_context;
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height);
+ const unsigned int size_y = geo.stride_y * geo.height_y;
+ const unsigned int size_c = geo.stride_c * geo.height_c;
+ const unsigned int size_pic = size_y + size_c * 2;
+ AVBufferRef * buf;
+ unsigned int i;
+
+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
+
+ if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+ {
+ av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ return AVERROR(ENOMEM);
+ }
+
+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+ frame->buf[i] = NULL;
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ }
+
+ frame->buf[0] = buf;
+ frame->linesize[0] = geo.stride_y;
+ frame->linesize[1] = geo.stride_c;
+ frame->linesize[2] = geo.stride_c;
+ frame->data[0] = buf->data;
+ frame->data[1] = frame->data[0] + size_y;
+ frame->data[2] = frame->data[1] + size_c;
+ frame->extended_data = frame->data;
+ // Leave extended buf alone
+
+ return 0;
+}
+
+
+#define RPI_GET_BUFFER2 1
+
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+{
+#if !RPI_GET_BUFFER2
+ return avcodec_default_get_buffer2(s, frame, flags);
+#else
+ int rv;
+
+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 ||
+ frame->format != AV_PIX_FMT_YUV420P)
+ {
+// printf("Do default alloc: format=%#x\n", frame->format);
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+ else
+ {
+ rv = rpi_get_display_buffer(s, frame);
+ }
+
+#if 0
+ printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+ frame->width, frame->height,
+ frame->linesize[0], frame->linesize[1], frame->linesize[2],
+ frame->data[0], frame->data[1], frame->data[2],
+ frame->buf[0], frame->buf[1], frame->buf[2],
+ av_buffer_get_opaque(frame->buf[0]));
+#endif
+ return rv;
+#endif
+}
+
+
+static AVBufferRef * zc_copy(struct AVCodecContext * const s,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * pdest;
+
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (rpi_get_display_buffer(s, dest) != 0)
+ {
+ return NULL;
+ }
+
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ memcpy(pdest, psrc, dest->width);
+ }
+ for (i = 0, psrc = src->data[1], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+ for (i = 0, psrc = src->data[2], pdest = dest->data[2];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+
+ return dest->buf[0];
+}
+
+
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const AVFrame * const frame, const int maycopy)
+{
+ assert(s != NULL);
+
+ if (frame->format != AV_PIX_FMT_YUV420P)
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format);
+ return NULL;
+ }
+
+ if (frame->buf[1] != NULL)
+ {
+ if (maycopy)
+ {
+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+ return zc_copy(s, frame);
+ }
+ else
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ if (pic_gm_ptr(frame->buf[0]) == NULL)
+ {
+ if (maycopy)
+ {
+ av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
+ return zc_copy(s, frame);
+ }
+ else
+ {
+ av_log(s, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ return av_buffer_ref(frame->buf[0]);
+}
+
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? -1 : p->vc_handle;
+}
+
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : p->numbytes;
+}
+
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
+{
+ if (fr_ref != NULL)
+ {
+ av_buffer_unref(&fr_ref);
+ }
+}
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void)
+{
+ ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
+ if (zc == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
+ return NULL;
+ }
+
+ zc_pool_init(&zc->pool);
+ return zc;
+}
+
+void av_rpi_zc_env_free(AVZcEnvPtr zc)
+{
+ if (zc != NULL)
+ {
+        zc_pool_destroy(&zc->pool);
+ av_free(zc);
+ }
+}
+
+int av_rpi_zc_init(struct AVCodecContext * const s)
+{
+ ZcEnv * const zc = av_rpi_zc_env_alloc();
+ if (zc == NULL)
+ {
+ return AVERROR(ENOMEM);
+ }
+
+ s->get_buffer_context = zc;
+ s->get_buffer2 = av_rpi_zc_get_buffer2;
+ return 0;
+}
+
+void av_rpi_zc_uninit(struct AVCodecContext * const s)
+{
+ if (s->get_buffer2 == av_rpi_zc_get_buffer2)
+ {
+ ZcEnv * const zc = s->get_buffer_context;
+ s->get_buffer2 = avcodec_default_get_buffer2;
+ s->get_buffer_context = NULL;
+ av_rpi_zc_env_free(zc);
+ }
+}
+
+#endif // RPI
+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000..f0109f4
--- /dev/null
+++ b/libavcodec/rpi_zc.h
@@ -0,0 +1,83 @@
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
+// Zero-Copy frame code for RPi
+// RPi needs Y/U/V planes to be contiguous for display. By default
+// ffmpeg will allocate separate planes, so a memcpy is needed before
+// display. This code provides a method of making ffmpeg allocate a single
+// block of memory for the frame, which can then be reference counted until
+// display has finished with it.
+
+#include "libavutil/frame.h"
+#include "libavcodec/avcodec.h"
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+ unsigned int stride_y;
+ unsigned int height_y;
+ unsigned int stride_c;
+ unsigned int height_c;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_open2
+//
+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1, as otherwise the buffer info is killed before being returned
+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
+// returned must be manually unrefed with av_frame_unref. This should be done
+// after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by _get_buffer_2
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+ const AVFrame * const frame, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Free an environment previously allocated by av_rpi_zc_env_alloc
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Free ZC from a context
+// There is nothing magic in this fn - it just packages unsetting
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
+
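+// Illustrative usage sketch (assumes an HEVC AVCodecContext *avctx and a
+// decoded AVFrame *frame; error handling omitted):
+//
+//   av_rpi_zc_init(avctx);               // before avcodec_open2
+//   avctx->refcounted_frames = 1;        // required, see note above
+//   ... decode into frame ...
+//   AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, 1);
+//   int vc = av_rpi_zc_vc_handle(ref);   // pass to MMAL/VideoCore for display
+//   av_frame_unref(frame);               // safe: ref keeps the buffer alive
+//   ... once display has finished with the buffer ...
+//   av_rpi_zc_unref(ref);
+//   av_rpi_zc_uninit(avctx);
+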
+#endif
+
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index f7adb52..708526e 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
*/
#include "config.h"
+
+#ifdef RPI
+// Move video buffers to GPU memory
+#define RPI_GPU_BUFFERS
+#endif
+
#include "libavutil/atomic.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
@@ -64,6 +70,10 @@
#include "libavutil/ffversion.h"
const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+#ifdef RPI_GPU_BUFFERS
+#include "rpi_qpu.h"
+#endif
+
#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
static int default_lockmgr_cb(void **arg, enum AVLockOp op)
{
@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
return ret;
}
+#ifdef RPI_GPU_BUFFERS
+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+{
+ GPU_MEM_PTR_T *p = opaque;
+ gpu_free(p);
+ av_free(p);
+}
+
+static AVBufferRef *rpi_buffer_alloc(int size)
+{
+ AVBufferRef *ret = NULL;
+ uint8_t *data = NULL;
+ GPU_MEM_PTR_T *p;
+
+ static int total=0;
+ total+=size;
+
+ p = av_malloc(sizeof *p);
+ if (!p)
+ return NULL;
+
+    if (gpu_malloc_cached(size,p)<0) { // Change this line to choose cached or uncached memory. The caching here refers to the ARM data cache.
+        av_free(p);
+        return NULL;
+    }
+
+ data = p->arm;
+ printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+ //memset(data, 64, size);
+
+ if (!data)
+ return NULL;
+
+ ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+ if (!ret) {
+ gpu_free(p);
+ av_freep(&p);
+ }
+
+ return ret;
+}
+#endif
+
static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
{
FramePool *pool = avctx->internal->pool;
@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
av_buffer_pool_uninit(&pool->pools[i]);
pool->linesize[i] = linesize[i];
if (size[i]) {
+#ifdef RPI_GPU_BUFFERS
+ if (avctx->codec_id == AV_CODEC_ID_HEVC)
+ pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+ CONFIG_MEMORY_POISONING ?
+ NULL :
+ rpi_buffer_alloc);
+ else
+#endif
pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
CONFIG_MEMORY_POISONING ?
NULL :
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index b31d233..2767306 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
#endif
{ 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
{ 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
{ 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
{ 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
{ 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 6f343f2..83f26d5 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
int default_stream_index = av_find_default_stream_index(s);
if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
for (i = 0; i < s->nb_streams; i++) {
- if (av_find_program_from_stream(s, NULL, i))
+ if (0 && av_find_program_from_stream(s, NULL, i))
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
diff --git a/libavutil/buffer.c b/libavutil/buffer.c
index 694e116..203ca7b 100644
--- a/libavutil/buffer.c
+++ b/libavutil/buffer.c
@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
return ret;
}
+
+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+ BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+ return buf->opaque;
+}
diff --git a/libavutil/buffer.h b/libavutil/buffer.h
index 0c0ce12..82e0bc3 100644
--- a/libavutil/buffer.h
+++ b/libavutil/buffer.h
@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
*/
AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+// Return the opaque for the underlying frame
+void *av_buffer_pool_opaque(AVBufferRef *ref);
+
/**
* @}
*/
diff --git a/pi-util/conf.sh b/pi-util/conf.sh
new file mode 100755
index 0000000..8b596a2
--- /dev/null
+++ b/pi-util/conf.sh
@@ -0,0 +1,33 @@
+echo "Configure for Pi2/3"
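+# Assumes it is run from the top of the ffmpeg source tree, with a Raspbian
+# sysroot and the arm-bcm2708 cross toolchain unpacked under ./build
+# (see RPI_ROOTFS and RPI_TOOLROOT below).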
+
+RPI_BUILDROOT=`pwd`/build
+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --arch=armv6t2\
+ --cpu=cortex-a7\
+ --target-os=linux\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000..61d1399
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100644
index 0000000..38f942f
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+conf_root = "/opt/conform/h265"
+ffmpeg_exec = "./ffmpeg"
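+
+# Illustrative usage (run from the ffmpeg build directory so ./ffmpeg exists,
+# with the conformance streams unpacked under /opt/conform/h265):
+#
+#   python pi-util/ffconf.py             # run every test listed in the CSV
+#   python pi-util/ffconf.py WPP RPS     # only tests whose names start with WPP or RPS
+#   python pi-util/ffconf.py --csvgen    # print a fresh CSV (to stdout) by scanning conf_root
+#
+# CSV columns: expected-result flag (1 = expect pass, 2 = currently expected
+# to fail, 0 = skip), test name, bitstream file, md5 file.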
+
+def testone(fileroot, name, es_file, md5_file):
+ tmp_root = "/tmp"
+
+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
+ try:
+ os.remove(dec_file)
+ except:
+ pass
+
+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+ # Unaligned needed for cropping conformance
+ rstr = subprocess.call(
+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
+ m1 = None
+ m2 = None
+ with open(os.path.join(fileroot, md5_file)) as f:
+ for line in f:
+ m1 = re.search("[0-9a-f]{32}", line.lower())
+ if m1:
+ break
+
+ with open(dec_file) as f:
+ m2 = re.search("[0-9a-f]{32}", f.readline())
+ except:
+ pass
+
+ rv = False
+ if m1 and m2 and m1.group() == m2.group():
+ print >> flog, "Match: " + m1.group()
+ rv = True
+ elif not m1:
+ print >> flog, "****** Cannot find m1"
+ elif not m2:
+ print >> flog, "****** Cannot find m2"
+ else:
+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+ flog.close()
+ return rv
+
+def scandir(root):
+ aconf = []
+ ents = os.listdir(conf_root)
+ ents.sort(key=str.lower)
+ for name in ents:
+ test_path = os.path.join(conf_root, name)
+ if S_ISDIR(os.stat(test_path).st_mode):
+ files = os.listdir(test_path)
+ es_file = "?"
+ md5_file = "?"
+ for f in files:
+ (base, ext) = os.path.splitext(f)
+ if base[0] == '.':
+ pass
+ elif ext == ".bit" or ext == ".bin":
+ es_file = f
+ elif ext == ".md5":
+ if md5_file == "?":
+ md5_file = f
+ elif base[-3:] == "yuv":
+ md5_file = f
+ aconf.append((1, name, es_file, md5_file))
+ return aconf
+
+def runtest(name, tests):
+ if not tests:
+ return True
+ for t in tests:
+ if name[0:len(t)] == t:
+ return True
+ return False
+
+def doconf(csva, tests):
+ failures = []
+ unx_success = []
+ for a in csva:
+ exp_test = int(a[0])
+ if (exp_test and runtest(a[1], tests)):
+ name = a[1]
+ print "==== ", name,
+ sys.stdout.flush()
+
+ if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
+ if exp_test == 1:
+ failures.append(name)
+ print ": * FAIL *"
+ else:
+ print ": fail"
+ else:
+ if exp_test == 2:
+ print ": * OK *"
+ unx_success.append(name)
+ else:
+ print ": ok"
+
+
+ if failures or unx_success:
+ print "Unexpected Failures:", failures
+ print "Unexpected Success: ", unx_success
+ else:
+ print "All tests normal"
+
+
+class ConfCSVDialect(csv.Dialect):
+ delimiter = ','
+ doublequote = True
+ lineterminator = '\n'
+ quotechar='"'
+ quoting = csv.QUOTE_MINIMAL
+ skipinitialspace = True
+ strict = True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+ argp.add_argument("tests", nargs='*')
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename")
+ args = argp.parse_args()
+
+ if args.csvgen:
+ csv.writer(sys.stdout).writerows(scandir(conf_root))
+ exit(0)
+
+ with open(args.csv, 'rt') as csvfile:
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+
+ doconf(csva, args.tests)
+
diff --git a/pi-util/qasm.py b/pi-util/qasm.py
new file mode 100644
index 0000000..1eacc04
--- /dev/null
+++ b/pi-util/qasm.py
@@ -0,0 +1,2502 @@
+#!/usr/bin/env python
+
+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment
+# add r0, r0, 1 # implicit mul nop
+# nop # explicit add nop, implicit mul nop
+# bkpt # implicit add/mul nop
+# mov r0, 0x1234 # hex immediate
+# mov r0, 20 * 40 # expressions...
+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits
+# mov r0, a:label # put address of label in r0
+# :label
+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address
+# :1
+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address
+# :1 # multiple definitions of numeric labels (differentiated using f/b)
+# .set my_val, 3 # introduce alias for 3
+# .set my_reg, r0 # and for r0
+# mov my_reg, my_val # then use them
+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3
+# .macro my_add, a, b, c # a, b, c act as if .set on entry
+# .set my_val, 10
+# add a, b, c
+# mov r0, my_val # 10
+# .endm # forget all .sets since .macro (including arg .sets)
+# mov r0, my_val # 3
+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right)
+
+import math
+import optparse
+import os
+import random
+import re
+import struct
+import sys
+import time
+
+###############################################################################
+# constants
+###############################################################################
+
+# ops
+######
+
+# negatives are internal qasm ops
+
+AOP_MOV = -3 # two operands
+AOP_BRA = -2 # two operands
+AOP_BRR = -1 # two operands
+AOP_NOP = 0x00 # no operands
+AOP_FADD = 0x01
+AOP_FSUB = 0x02
+AOP_FMIN = 0x03
+AOP_FMAX = 0x04
+AOP_FMINABS = 0x05
+AOP_FMAXABS = 0x06
+AOP_FTOI = 0x07 # two operands
+AOP_ITOF = 0x08 # two operands
+AOP_ADD = 0x0c
+AOP_SUB = 0x0d
+AOP_SHR = 0x0e
+AOP_ASR = 0x0f
+AOP_ROR = 0x10
+AOP_SHL = 0x11
+AOP_MIN = 0x12
+AOP_MAX = 0x13
+AOP_AND = 0x14
+AOP_OR = 0x15
+AOP_XOR = 0x16
+AOP_NOT = 0x17 # two operands
+AOP_CLZ = 0x18 # two operands
+AOP_V8ADDS = 0x1e
+AOP_V8SUBS = 0x1f
+
+MOP_MOV = -1 # two operands
+MOP_NOP = 0x0 # no operands
+MOP_FMUL = 0x1
+MOP_MUL24 = 0x2
+MOP_V8MULD = 0x3
+MOP_V8MIN = 0x4
+MOP_V8MAX = 0x5
+MOP_V8ADDS = 0x6
+MOP_V8SUBS = 0x7
+
+# ldi modes
+############
+
+LDI_32 = 0
+LDI_EL_SIGNED = 1
+LDI_EL_UNSIGNED = 3
+LDI_SEMA = 4
+
+# conds
+########
+
+COND_NEVER = 0
+COND_ALWAYS = 1
+COND_IFZ = 2
+COND_IFNZ = 3
+COND_IFN = 4
+COND_IFNN = 5
+COND_IFC = 6
+COND_IFNC = 7
+
+BCOND_ALLZ = 0
+BCOND_ALLNZ = 1
+BCOND_ANYZ = 2
+BCOND_ANYNZ = 3
+BCOND_ALLN = 4
+BCOND_ALLNN = 5
+BCOND_ANYN = 6
+BCOND_ANYNN = 7
+BCOND_ALLC = 8
+BCOND_ALLNC = 9
+BCOND_ANYC = 10
+BCOND_ANYNC = 11
+BCOND_ALWAYS = 15
+
+# packing/unpacking
+####################
+
+# regfile a pack modes
+PACK_A_NOP = 0
+PACK_A_16A = 1
+PACK_A_16B = 2
+PACK_A_8888 = 3
+PACK_A_8A = 4
+PACK_A_8B = 5
+PACK_A_8C = 6
+PACK_A_8D = 7
+PACK_A_32S = 8
+PACK_A_16AS = 9
+PACK_A_16BS = 10
+PACK_A_8888S = 11
+PACK_A_8AS = 12
+PACK_A_8BS = 13
+PACK_A_8CS = 14
+PACK_A_8DS = 15
+
+# mul unit pack modes
+PACK_MUL_NOP = 0
+PACK_MUL_8888 = 3
+PACK_MUL_8A = 4
+PACK_MUL_8B = 5
+PACK_MUL_8C = 6
+PACK_MUL_8D = 7
+
+# regfile a unpack modes
+UNPACK_A_NOP = 0
+UNPACK_A_16A = 1
+UNPACK_A_16B = 2
+UNPACK_A_8R = 3
+UNPACK_A_8A = 4
+UNPACK_A_8B = 5
+UNPACK_A_8C = 6
+UNPACK_A_8D = 7
+
+# r4 unpack modes
+UNPACK_R4_NOP = 0
+UNPACK_R4_16A = 1
+UNPACK_R4_16B = 2
+UNPACK_R4_8R = 3
+UNPACK_R4_8A = 4
+UNPACK_R4_8B = 5
+UNPACK_R4_8C = 6
+UNPACK_R4_8D = 7
+
+PACK_TYPE_INT = 0
+PACK_TYPE_FLOAT = 1
+PACK_TYPE_EITHER = -1
+
+PACK_MODE_A = 0 # regfile a
+PACK_MODE_M = 1 # mul unit
+PACK_MODE_EITHER = -1
+
+UNPACK_LOC_A = 0 # regfile a
+UNPACK_LOC_R4 = 1 # r4
+UNPACK_LOC_AB = 2 # either regfile a or regfile b
+UNPACK_LOC_OTHER = 3 # somewhere else
+
+# args
+#######
+
+# loc_t, ie internal
+MUX_AC = 0
+MUX_ANY = 1
+MUX_A = 2
+MUX_B = 3
+RW_EITHER = 0
+RW_READ = 1
+RW_WRITE = 2
+
+RADDR_NOP = 39
+
+# negatives are for internal use
+RMUX_SEMA = -6
+RMUX_LABEL = -5
+RMUX_IMMV = -4
+RMUX_IMM = -3
+RMUX_AC = -2
+RMUX_ANY = -1
+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5
+RMUX_A = 6
+RMUX_B = 7
+
+WADDR_R0 = 32 # followed by R1, R2, R3
+WADDR_NOP = 39
+
+WMUX_ANY = 0
+WMUX_A = 1
+WMUX_B = 2
+
+# signals
+##########
+
+SIG_BKPT = 0
+SIG_NORMAL = 1
+SIG_THRSW = 2
+SIG_THREND = 3
+SIG_SBWAIT = 4
+SIG_SBDONE = 5
+SIG_INT = 6 # on a0
+SIG_LTHRSW = 6 # on b0
+SIG_LOADCV = 7
+SIG_LOADC = 8
+SIG_LDCEND = 9
+SIG_LDTMU0 = 10
+SIG_LDTMU1 = 11
+SIG_ROTATE = 12 # on a0
+SIG_LOADAM = 12 # on b0
+SIG_SMALLIMMED = 13
+SIG_IMMED = 14
+SIG_BRANCH = 15
+
+# multi-line assembler constructs
+##################################
+
+CONSTRUCT_MACRO = 0x1
+CONSTRUCT_IF = 0x2
+CONSTRUCT_ELSE = 0x4
+CONSTRUCT_REP = 0x8
+
+###############################################################################
+# helpers
+###############################################################################
+
+def asm_error(message, location = None):
+ if location is None:
+ location = current_location
+ if location == '':
+ sys.stderr.write('qasm ERROR: %s\n' % message)
+ else:
+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message))
+ sys.exit(-1)
+
+def asm_warning(message, location = None):
+ if disable_warnings or (nwarn_level != 0):
+ return
+ if location is None:
+ location = current_location
+ if location == '':
+ sys.stderr.write('qasm WARNING: %s\n' % message)
+ else:
+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message))
+ if warnings_are_errors:
+ asm_error('warnings are errors!', location)
+
+# smart_split('') = []
+# smart_split('a') = ['a']
+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
+def smart_split(s, delim = ',', count = 0):
+ if len(s) == 0:
+ return []
+ parts = []
+ depth = 0
+ i = 0
+ for j in xrange(len(s)):
+ if s[j] in '([{':
+ depth += 1
+ elif s[j] in ')]}':
+ depth -= 1
+ elif (s[j] == delim) and (depth == 0):
+ parts.append(s[i:j])
+ i = j + 1
+ if len(parts) == count:
+ break
+ if depth != 0:
+ asm_error('bracket nesting fail')
+ parts.append(s[i:])
+ return parts
+
+def is_int(x):
+ return isinstance(x, int) or isinstance(x, long)
+
+###############################################################################
+# "parsing" stuff
+###############################################################################
+
+re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$')
+re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
+re_include = re.compile('\\.include\\s(?P<filename>.+)$')
+re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
+re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
+re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
+re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
+re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
+re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$')
+re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
+re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$')
+re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$')
+re_label_ref_left = re.compile('\\b([ar]):')
+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$')
+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
+
+# ops
+######
+
+aops = {
+ 'mov': (AOP_MOV, 2),
+ 'bra': (AOP_BRA, 2),
+ 'brr': (AOP_BRR, 2),
+ 'nop': (AOP_NOP, 0),
+ 'fadd': (AOP_FADD, 3),
+ 'fsub': (AOP_FSUB, 3),
+ 'fmin': (AOP_FMIN, 3),
+ 'fmax': (AOP_FMAX, 3),
+ 'fminabs': (AOP_FMINABS, 3),
+ 'fmaxabs': (AOP_FMAXABS, 3),
+ 'ftoi': (AOP_FTOI, 2),
+ 'itof': (AOP_ITOF, 2),
+ 'add': (AOP_ADD, 3),
+ 'sub': (AOP_SUB, 3),
+ 'shr': (AOP_SHR, 3),
+ 'asr': (AOP_ASR, 3),
+ 'ror': (AOP_ROR, 3),
+ 'shl': (AOP_SHL, 3),
+ 'min': (AOP_MIN, 3),
+ 'max': (AOP_MAX, 3),
+ 'and': (AOP_AND, 3),
+ 'or': (AOP_OR, 3),
+ 'xor': (AOP_XOR, 3),
+ 'not': (AOP_NOT, 2),
+ 'clz': (AOP_CLZ, 2),
+ 'v8adds': (AOP_V8ADDS, 3),
+ 'v8subs': (AOP_V8SUBS, 3)}
+
+def get_aop(aop):
+ if aop not in aops:
+ asm_error('invalid aop')
+ return aops[aop]
+
+mops = {
+ 'mov': (MOP_MOV, 2),
+ 'nop': (MOP_NOP, 0),
+ 'fmul': (MOP_FMUL, 3),
+ 'mul24': (MOP_MUL24, 3),
+ 'v8muld': (MOP_V8MULD, 3),
+ 'v8min': (MOP_V8MIN, 3),
+ 'v8max': (MOP_V8MAX, 3),
+ 'v8adds': (MOP_V8ADDS, 3),
+ 'v8subs': (MOP_V8SUBS, 3)}
+
+def get_mop(mop):
+ if mop not in mops:
+ asm_error('invalid mop')
+ return mops[mop]
+
+# conds
+########
+
+conds = {
+ 'ifz': COND_IFZ,
+ 'ifnz': COND_IFNZ,
+ 'ifn': COND_IFN,
+ 'ifnn': COND_IFNN,
+ 'ifc': COND_IFC,
+ 'ifnc': COND_IFNC}
+
+def get_cond(cond):
+ if not cond:
+ return COND_ALWAYS
+ if cond not in conds:
+ asm_error('invalid cond')
+ return conds[cond]
+
+bconds = {
+ 'allz': BCOND_ALLZ,
+ 'allnz': BCOND_ALLNZ,
+ 'anyz': BCOND_ANYZ,
+ 'anynz': BCOND_ANYNZ,
+ 'alln': BCOND_ALLN,
+ 'allnn': BCOND_ALLNN,
+ 'anyn': BCOND_ANYN,
+ 'anynn': BCOND_ANYNN,
+ 'allc': BCOND_ALLC,
+ 'allnc': BCOND_ALLNC,
+ 'anyc': BCOND_ANYC,
+ 'anync': BCOND_ANYNC}
+
+def get_bcond(bcond):
+ if not bcond:
+ return BCOND_ALWAYS
+ if bcond not in bconds:
+ asm_error('invalid bcond')
+ return bconds[bcond]
+
+def get_setf(setf):
+ if not setf:
+ return False
+ return True
+
+# packing/unpacking
+####################
+
+packs = {
+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A),
+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A),
+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A),
+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A),
+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A),
+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A),
+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A),
+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M),
+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)}
+
+def get_pack(pack):
+ if not pack:
+ return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER)
+ if pack not in packs:
+ asm_error('invalid pack')
+ return packs[pack]
+
+a_unpacks = {
+ '16a': (UNPACK_A_16A, PACK_TYPE_INT),
+ '16b': (UNPACK_A_16B, PACK_TYPE_INT),
+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT),
+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT),
+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER),
+ '8a': (UNPACK_A_8A, PACK_TYPE_INT),
+ '8b': (UNPACK_A_8B, PACK_TYPE_INT),
+ '8c': (UNPACK_A_8C, PACK_TYPE_INT),
+ '8d': (UNPACK_A_8D, PACK_TYPE_INT),
+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT),
+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT),
+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT),
+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)}
+
+def get_a_unpack(unpack):
+ if not unpack:
+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A)
+ if unpack not in a_unpacks:
+ asm_error('invalid ra unpack')
+ return a_unpacks[unpack] + (UNPACK_LOC_A,)
+
+r4_unpacks = {
+ '16af': UNPACK_R4_16A,
+ '16bf': UNPACK_R4_16B,
+ '8dr': UNPACK_R4_8R,
+ '8ac': UNPACK_R4_8A,
+ '8bc': UNPACK_R4_8B,
+ '8cc': UNPACK_R4_8C,
+ '8dc': UNPACK_R4_8D}
+
+def get_r4_unpack(unpack):
+ if not unpack:
+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4)
+ if unpack not in r4_unpacks:
+ asm_error('invalid r4 unpack')
+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4)
+
+# args
+#######
+
+class loc_t:
+ def __init__(self, mux, i, rot, r5_rot, pack, rw):
+ self.mux = mux
+ self.i = i
+ self.rot = rot % 16
+ self.r5_rot = r5_rot % 16
+ self.pack = pack
+ self.rw = rw
+
+ def copy(self):
+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw)
+
+ def __add__(self, i):
+ if not is_int(i):
+ raise Exception('can only add integer to loc')
+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw)
+
+ def __sub__(self, i):
+ if not is_int(i):
+ raise Exception('can only subtract integer from loc')
+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw)
+
+ def __cmp__(self, other):
+ if is_int(other):
+ return cmp(self.i, other)
+ if not isinstance(other, loc_t):
+ raise Exception('can only compare loc to integer or other loc')
+ if self.mux != other.mux:
+ return cmp(self.mux, other.mux)
+ if self.i != other.i:
+ return cmp(self.i, other.i)
+ if self.rot != other.rot:
+ return cmp(self.rot, other.rot)
+ if self.r5_rot != other.r5_rot:
+ return cmp(self.r5_rot, other.r5_rot)
+ return cmp(self.pack, other.pack)
+
+ def is_r5(self):
+ return (self.mux == MUX_AC) and (self.i == 5)
+
+ def shift(self, rot, left):
+ if isinstance(rot, loc_t) and rot.is_r5():
+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack:
+ raise Exception('can\'t rotate by rotated/unpacked r5')
+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw)
+ if not is_int(rot):
+ raise Exception('can only rotate by integer or r5')
+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw)
+
+ def __lshift__(self, rot):
+ return self.shift(rot, True)
+
+ def __rshift__(self, rot):
+ return self.shift(rot, False)
+
+ def __getattr__(self, name):
+ # discard the first character if it is an underscore. this is a total hack
+ # to allow packs starting with a digit to work
+ if name[0] == '_':
+ name = name[1:]
+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks):
+ if self.pack:
+ raise Exception('can\'t specify two packs')
+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw)
+ raise AttributeError()
+
+ def __str__(self):
+ if self.mux == MUX_AC:
+ return 'r%d' % self.i
+ if self.mux == MUX_ANY:
+ return 'rany%d' % self.i
+ if self.mux == MUX_A:
+ return 'ra%d' % self.i
+ if self.mux == MUX_B:
+ return 'rb%d' % self.i
+ assert 0
+
+class sema_t:
+ def __init__(self, acq, i):
+ if not is_int(i):
+ raise Exception('semaphore index must be integer')
+ self.acq = acq
+ self.i = i
+
+class label_t:
+ def __init__(self, rel, name, offset):
+ self.rel = rel
+ self.name = name
+ self.offset = offset
+
+ def __add__(self, offset):
+ return label_t(self.rel, self.name, self.offset + offset)
+
+ def __sub__(self, offset):
+ return label_t(self.rel, self.name, self.offset - offset)
+
+class label_maker_t:
+ def __init__(self, rel):
+ self.rel = rel
+
+ def __getattr__(self, name):
+ # we discard the first character. this is a total hack to allow numeric labels to work
+ if not re_label_ref_right.match(name[1:]):
+ raise Exception('invalid label reference')
+ return label_t(self.rel, name[1:], 0)
+
+def bits(x, n):
+ if (x >> n) != 0:
+ raise Exception('%d doesn\'t fit in %d bits' % (x, n))
+ return x
+
+def bitsw(x, n):
+ if x == (1 << n):
+ x = 0
+ return bits(x, n)
+
+def bitsws(x, n):
+ if x == (1 << (n - 1)):
+ x = 0
+ if -(1 << (n - 1)) <= x < 0:
+ x += 1 << n
+ return bits(x, n)
+
+def vpm_setup(n, stride, addr, v2 = False):
+ horiz, laned, size, y, x, p = addr
+ if size not in (0, 1, 2):
+ raise Exception('addr size should be 0, 1, or 2')
+ if horiz:
+ if x != 0:
+ raise Exception('horizontal accesses must have x of 0')
+ else:
+ if (y & 0xf) != 0:
+ raise Exception('vertical accesses must be 16 row aligned')
+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size)
+ if v2:
+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) |
+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size))
+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) |
+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size))
+
+def vdw_setup_0(n, m, addr):
+ horiz, size, y, x, p = addr
+ if size not in (0, 1, 2):
+ raise Exception('addr size should be 0, 1, or 2')
+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) |
+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size))
+
+def vdr_setup_0(n, m, addr, vpm_stride, stride):
+ horiz, size, y, x, p = addr
+ if size not in (0, 1, 2):
+ raise Exception('addr size should be 0, 1, or 2')
+ if (stride < 8) or (stride & (stride - 1)):
+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride')
+ log2_stride = 3
+ while (1 << log2_stride) != stride:
+ log2_stride += 1
+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) |
+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) |
+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4))
+
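+# Illustrative example: vpm_setup(16, 1, h32(0)) builds a VPM setup word for 16
+# horizontal 32-bit rows starting at Y=0 with a stride of one row per access
+# (the h32/v32/dma_* address helpers are defined in arg_defs below), while
+# vdw_setup_0/vdr_setup_0 build the corresponding VPM DMA setup words from the
+# dma_* helpers.
+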
+class allocator_t:
+ def __init__(self, *available):
+ self.available = list(available)
+ self.allocated = {}
+ self.reserved = []
+
+ def copy(self):
+ a = allocator_t()
+ a.available = self.available[:]
+ a.allocated = self.allocated.copy()
+ a.reserved = self.reserved[:]
+ return a
+
+ def forget(self):
+ self.__init__(self.available + self.allocated.values() + self.reserved)
+
+ def reserve(self, *rs):
+ for r in rs:
+ self.available.remove(r)
+ self.reserved.append(r)
+
+ def retire(self, name):
+ r = self.allocated.pop(name)
+ del r.__invert__
+ del r.retire
+ self.available.append(r)
+ return r
+
+ def __getattr__(self, name):
+ if name not in self.allocated:
+ r = self.available.pop()
+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax
+ r.__invert__ = r.retire
+ self.allocated[name] = r
+ return self.allocated[name]
+
+def pragma_allow_xor_0(x):
+ global allow_xor_0
+
+ if not isinstance(x, bool):
+ raise Exception('allow_xor_0 must be bool')
+ x, allow_xor_0 = allow_xor_0, x
+ return x
+
+def pragma_dont_warn_when_mul_rot_inp_r5(x):
+ global dont_warn_when_mul_rot_inp_r5
+
+ if not isinstance(x, bool):
+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool')
+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x
+ return x
+
+arg_defs = {
+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions)
+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER),
+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER),
+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ),
+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ),
+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE),
+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE),
+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE),
+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ),
+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ),
+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE),
+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE),
+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER),
+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER),
+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER),
+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER),
+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE),
+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE),
+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE),
+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE),
+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER),
+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ),
+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ),
+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE),
+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE),
+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ),
+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ),
+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE),
+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE),
+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER),
+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE),
+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE),
+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE),
+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE),
+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE),
+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE),
+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE),
+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE),
+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE),
+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE),
+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE),
+
+ # semaphore acq/rel
+ 'sacq': lambda i: sema_t(True, i),
+ 'srel': lambda i: sema_t(False, i),
+
+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label)
+ 'r_label_maker': label_maker_t(True),
+ 'a_label_maker': label_maker_t(False),
+
+ # handy functions
+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0],
+ 'sqrt': math.sqrt,
+ 'sin': math.sin,
+ 'cos': math.cos,
+ 'atan2': math.atan2,
+ 'pi': math.pi,
+ 'rseed': random.seed,
+ 'rand': lambda: int(random.getrandbits(32)),
+ 'bits': bits,
+ 'bitsw': bitsw,
+ 'bitsws': bitsws,
+
+ # handy vpm/vdw/vdr stuff
+ 'h32': lambda y: (1, 0, 0, y, 0, 0),
+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p),
+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p),
+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p),
+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p),
+ 'v32': lambda y, x: (0, 0, 0, y, x, 0),
+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p),
+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p),
+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p),
+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p),
+ 'dma_h32': lambda y, x: (1, 0, y, x, 0),
+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p),
+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p),
+ 'dma_v32': lambda y, x: (0, 0, y, x, 0),
+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p),
+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p),
+ 'vpm_setup': vpm_setup,
+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True),
+ 'vdw_setup_0': vdw_setup_0,
+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13),
+ 'vdr_setup_0': vdr_setup_0,
+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride
+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13),
+
+ # annotations
+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)),
+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff),
+ 'preserve_cond': ('preserve_cond', 1),
+
+ # somewhat experimental register allocator
+ 'allocator_t': allocator_t,
+
+ # pragmas
+ 'pragma_allow_xor_0': pragma_allow_xor_0,
+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5}
+
+# accumulators and regs (regular names -- r0, ra0, etc)
+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6))
+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
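+# r0-r5 name the accumulators, ra0-ra63 and rb0-rb63 name regfile A/B
+# locations, and rany0-rany63 leave the regfile choice to the assembler.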
+
+def arg_eval(arg, sets):
+ s = (arg.strip().split('.', 1) + [None])[:2]
+ if s[0] == '-':
+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE)
+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings...
+ arg = re_pack.sub('._\\1', arg)
+ try:
+ # todo: i would like to be able to pass both arg_defs and sets in here
+ # (with sets hiding arg_defs in the case of conflicts), but the obvious
+ # dict(arg_defs, **sets) won't permit things such as:
+ # .set f, lambda x: y
+ # .set y, 4
+ # (when f is later called, the y in the lambda will be looked up in the
+ # temporary dict we created when evaluating the f .set, which doesn't contain y)
+ #
+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the
+ # hiding behaviour, on an unset, we restore any hidden arg_defs value.
+ # also, before dumping sets at the end, we strip out the arg_defs stuff
+ # (this isn't entirely correct as we want to dump sets that are hiding
+ # arg_defs)
+ return eval(arg, sets)
+ except Exception, e:
+ asm_error(e)
+ except:
+ asm_error('unknown error while evaluating argument')
+
+# doesn't check/fixup pack
+def check_and_fixup_loc(loc, read):
+ if (not read) and (loc.rw == RW_READ):
+ asm_error('writing to read-only hardware register')
+ if read and (loc.rw == RW_WRITE):
+ asm_error('reading from write-only hardware register')
+ if not read:
+ # conceptually, we are writing to a location rotated right by
+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by
+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location
+ loc.rot = -loc.rot % 16
+ loc.r5_rot = -loc.r5_rot % 16
+ if (loc.rot != 0) and (loc.r5_rot != 0):
+ asm_error('can\'t rotate by both r5 and immediate')
+ if (loc.r5_rot != 0) and (loc.r5_rot != 1):
+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read])
+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later
+ if not read:
+ asm_error('target doesn\'t support write rotation')
+ if loc.mux == MUX_ANY:
+ loc.mux = MUX_A # can't do rotated read from regfile b
+ if loc.mux != MUX_A:
+ asm_error('rotation on read only allowed from regfile a')
+ if loc.i >= 32:
+ asm_warning('rotation only works from physical regfile')
+ if loc.mux == MUX_AC:
+ if (loc.i < 0) or (loc.i >= 6):
+ asm_error('reg out of range')
+ if not read:
+ if loc.i == 4:
+ asm_error('not allowed to write to r4')
+ if loc.i == 5:
+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep')
+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B):
+ if (loc.i < 0) or (loc.i >= 64):
+ asm_error('reg out of range')
+ else:
+ assert 0
+
+def get_dst(dst, sets):
+ if not dst:
+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0
+ dst = arg_eval(dst, sets)
+ if not isinstance(dst, loc_t):
+ asm_error('invalid dst')
+ dst = dst.copy()
+ check_and_fixup_loc(dst, False)
+ pack = get_pack(dst.pack)
+ if dst.mux == MUX_AC:
+ if pack[2] == PACK_MODE_A:
+ asm_warning('ra packing only works when writing to physical regfile')
+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation
+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32):
+ asm_warning('ra packing only works when writing to physical regfile')
+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+ if dst.mux == MUX_ANY:
+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+ if dst.mux == MUX_B:
+ if pack[2] == PACK_MODE_A:
+ asm_error('this packing operation can only be used for regfile a')
+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot
+ assert 0
+
+def get_src(src, sets):
+ if not src:
+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None
+ src = arg_eval(src, sets)
+ if isinstance(src, sema_t):
+ if not have_sema:
+ asm_error('target does not support semaphores')
+ if (src.i < 0) or (src.i >= 16):
+ asm_error('semaphore number must be in [0, 16)')
+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if isinstance(src, label_t):
+ return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if isinstance(src, list):
+ if len(src) != 16:
+ asm_error('vector immediate must have length 16')
+ src = src[:]
+ for i in xrange(16):
+ if not is_int(src[i]):
+ asm_error('all elements of vector immediate must be integers')
+ src[i] &= (1 << 32) - 1
+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if is_int(src):
+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+ if not isinstance(src, loc_t):
+ asm_error('invalid src')
+ src = src.copy()
+ check_and_fixup_loc(src, True)
+ if mulw_rotate:
+ srot, sr5rot = 0, 0
+ drot, dr5rot = src.rot, src.r5_rot
+ else:
+ srot, sr5rot = src.rot, src.r5_rot
+ drot, dr5rot = 0, 0
+ if src.mux == MUX_AC:
+ if src.i == 4:
+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot
+ if src.pack:
+ asm_error('unpack only allowed for regfile a or r4')
+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b
+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot
+ if src.mux == MUX_ANY:
+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot
+ if src.mux == MUX_B:
+ if src.pack:
+ asm_error('unpack only allowed for regfile a or r4')
+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+ assert 0
+
+# signals
+##########
+
+sigs = {
+ 'bkpt': SIG_BKPT,
+ 'thrsw': SIG_THRSW,
+ 'thrend': SIG_THREND,
+ 'sbwait': SIG_SBWAIT,
+ 'sbdone': SIG_SBDONE,
+ 'int': SIG_INT,
+ 'loadcv': SIG_LOADCV,
+ 'loadc': SIG_LOADC,
+ 'ldcend': SIG_LDCEND,
+ 'ldtmu0': SIG_LDTMU0,
+ 'ldtmu1': SIG_LDTMU1}
+
+def get_sig(sig):
+ if sig not in sigs:
+ return SIG_NORMAL
+ return sigs[sig]
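+# asm_line treats the last ';'-separated field of an instruction as a signal
+# if it matches one of the names above; otherwise the instruction is
+# assembled with SIG_NORMAL.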
+
+# annotations
+##############
+
+def get_annots(annot, sets):
+ annots = arg_eval(annot, sets)
+ if isinstance(annots, list):
+ annots = annots[:]
+ else:
+ annots = [annots]
+ for i, annot in enumerate(annots):
+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or
+ (not is_int(annot[1]))):
+ asm_error('annotation must be (string, integer) pair, or a list of such pairs')
+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1))
+ return annots
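+# Annotations are appended to an instruction after '@' and must evaluate to a
+# (string, integer) pair or a list of such pairs, e.g. "@ mul_used(0, 1, 2, 3)"
+# marks which output elements of the mul op are actually used.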
+
+###############################################################################
+# core
+###############################################################################
+
+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats):
+ needfloat = PACK_TYPE_EITHER
+ havefloata = False
+ havefloatr4 = False
+ unpacka = None
+ unpackr4 = None
+ forcebs = [False, False, False, False]
+ forcerafloat = False
+
+ pm = PACK_MODE_EITHER
+ for i in (0, 1, 2, 3):
+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB):
+ assert rpacks[i][0] == 0
+ else:
+ if rpacks[i][2] == UNPACK_LOC_A:
+ if unpacka is None:
+ unpacka = rpacks[i][0]
+ elif unpacka != rpacks[i][0]:
+ asm_error('conflicting unpack operations on regfile a')
+ havefloata = havefloata or rfloats[i]
+ elif rpacks[i][2] == UNPACK_LOC_R4:
+ if unpackr4 is None:
+ unpackr4 = rpacks[i][0]
+ elif unpackr4 != rpacks[i][0]:
+ asm_error('conflicting unpack operations on r4')
+ havefloatr4 = havefloatr4 or rfloats[i]
+ else:
+ assert 0
+
+ if rpacks[i][1] != PACK_TYPE_EITHER:
+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]):
+ asm_error('conflicting unpack float requirements')
+ needfloat = rpacks[i][1]
+ for i in (0, 1, 2, 3):
+ if rpacks[i][2] == UNPACK_LOC_AB:
+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP):
+ forcebs[i] = True # non-nop unpack from regfile a. must use b
+
+ if unpacka:
+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat:
+ havefloata = True
+ forcerafloat = True
+ havefloat = havefloata
+ else:
+ havefloat = havefloatr4
+
+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat):
+ asm_error('float unpack operation used in integer alu operations')
+ if (needfloat == PACK_TYPE_INT) and havefloat:
+ asm_error('integer unpack operation used in float alu operation')
+
+ unpack = 0
+ if unpacka and unpackr4:
+ asm_error('cannot specify pack operation for both regfile a and r4')
+ if unpacka:
+ pm = PACK_MODE_A
+ unpack = unpacka
+ elif unpackr4:
+ pm = PACK_MODE_M
+ unpack = unpackr4
+
+ pack = 0
+ if wpacks[0][2] == PACK_MODE_M:
+ asm_error('mul-unit pack operation used on add result')
+ for i in (0, 1):
+ if wpacks[i][2] == PACK_MODE_A:
+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A):
+ asm_error('conflicting pack modes')
+ pm = PACK_MODE_A
+ pack = wpacks[i][0]
+ elif wpacks[i][2] == PACK_MODE_M:
+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M):
+ asm_error('conflicting pack modes')
+ pm = PACK_MODE_M
+ pack = wpacks[i][0]
+
+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]):
+ asm_error('float pack operation used with integer alu result')
+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]:
+ asm_error('integer pack operation used with float alu result')
+
+ if pm == PACK_MODE_EITHER:
+ pm = PACK_MODE_A
+ return pm, pack, unpack, forcebs, forcerafloat
+
+# immediates that can be encoded with SIG_SMALLIMMED
+bimms = {}
+bimms.update((i, i) for i in xrange(16))
+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32))
+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40))
+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48))
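+# i.e. the encodable values are the integers 0..15 and -16..-1 (as 32-bit
+# two's complement), plus the float bit patterns 1.0, 2.0, ..., 128.0 and
+# 1.0/256, 1.0/128, ..., 1.0/2.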
+
+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux):
+ if rmux == RMUX_SEMA:
+ asm_error('semaphore op can only be used with mov')
+ if rmux == RMUX_LABEL:
+ asm_error('label not allowed here')
+ if rmux == RMUX_IMMV:
+ asm_error('vector immediate can only be used with mov')
+ if rmux == RMUX_IMM:
+ if raddr not in bimms:
+ asm_error('can\'t encode immediate 0x%08x' % raddr)
+ raddr = bimms[raddr]
+ if not immb:
+ if raddr_b is not None:
+ asm_error('regfile b and immediates don\'t mix')
+ raddr_b = raddr
+ immb = True
+ elif raddr_b != raddr:
+ asm_error('can only encode one rotation/immediate')
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ if rmux == RMUX_AC:
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr
+ if rmux == RMUX_ANY:
+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr):
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ if (not immb) and (raddr_b == raddr):
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ if raddr_a is None:
+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))
+ raddr_a = raddr
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ if raddr_b is None:
+ assert not immb
+ raddr_b = raddr
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ asm_error('no free read slots')
+ if rmux == RMUX_A:
+ if (not mulw_rotate) and (raddr_a is not None) and (
+ ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))):
+ asm_error('conflicting rotations from regfile a')
+ if raddr_a is None:
+ raddr_a = raddr[0]
+ elif raddr_a != raddr[0]:
+ asm_error('can only read from one location in each regfile')
+ arot_r5 = raddr[2]
+ if raddr[1] == 0:
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ raddr = 48 + raddr[1]
+ if not immb:
+ if raddr_b is not None:
+ asm_error('regfile b and rotation don\'t mix')
+ raddr_b = raddr
+ immb = True
+ elif raddr_b != raddr:
+ asm_error('can only encode one rotation/immediate')
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+ if rmux == RMUX_B:
+ if immb:
+ asm_error('regfile b and rotation/immediates don\'t mix')
+ if raddr_b is None:
+ raddr_b = raddr
+ elif raddr_b != raddr:
+ asm_error('can only read from one location in each regfile')
+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+ assert 0
+
+# ok if:
+# - accumulator (r0-r3)
+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy,
+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it
+# was written by r5quad. so, by default, r5 isn't considered uniform. todo:
+# what about vr_wait/vw_wait/mutex?
+def read_rot_ok(rmux, raddr_a, raddr_b):
+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or
+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy
+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy
+
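+# Queued .data bytes are padded to a multiple of 8 and packed little-endian
+# into (low word, high word) pairs so they occupy whole instruction slots.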
+def asm_flush_prog_data():
+ global prog_data
+
+ while len(prog_data) & 7:
+ prog_data.append(0)
+ for i in xrange(0, len(prog_data), 8):
+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0),
+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {}))
+ prog_data = []
+
+def asm_line(sets, location, line):
+ global current_location, construct, nwarn_level
+
+ prev_location = current_location
+ current_location = location
+
+ try:
+ if construct != None:
+ if re_macro.match(line):
+ construct_stack.append(CONSTRUCT_MACRO)
+ elif re_if.match(line):
+ construct_stack.append(CONSTRUCT_IF)
+ elif re_rep.match(line):
+ construct_stack.append(CONSTRUCT_REP)
+ else:
+ else_m = line == '.else'
+ elif_m = re_elif.match(line)
+ if elif_m:
+ end_construct = CONSTRUCT_IF
+ else:
+ end_construct = {
+ '.endm': CONSTRUCT_MACRO,
+ '.else': CONSTRUCT_IF,
+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE,
+ '.endr': CONSTRUCT_REP}.get(line)
+ if end_construct is not None:
+ end_construct &= construct_stack.pop()
+ if end_construct == 0:
+ if elif_m:
+ asm_error('unexpected .elif')
+ asm_error('unexpected %s' % line)
+ if len(construct_stack) == 0:
+ lines = construct
+ construct = None
+ if end_construct == CONSTRUCT_MACRO:
+ return
+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE):
+ condition_if, condition_else = lines[0]
+ lines = lines[1:]
+ if condition_if:
+ for location, line in lines:
+ asm_line(sets, location, line)
+ if else_m:
+ construct = [(condition_else, False)]
+ construct_stack.append(CONSTRUCT_ELSE)
+ elif elif_m:
+ if elif_m.group('set'):
+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets))
+ else:
+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets)
+ condition_else = condition_else and (not condition_if)
+ construct = [(condition_if, condition_else)]
+ construct_stack.append(CONSTRUCT_IF)
+ return
+ if end_construct == CONSTRUCT_REP:
+ name, count = lines[0]
+ lines = lines[1:]
+ for i in xrange(count):
+ sets[name] = i
+ for location, line in lines:
+ asm_line(sets, location, line)
+ return
+ assert 0
+ if else_m:
+ construct_stack.append(CONSTRUCT_ELSE)
+ elif elif_m:
+ construct_stack.append(CONSTRUCT_IF)
+ construct.append((current_location, line))
+ return
+
+ if line in ('.endm', '.else', '.endif', '.endr'):
+ asm_error('unexpected %s' % line)
+ if re_elif.match(line):
+ asm_error('unexpected .elif')
+
+ m = re_macro.match(line)
+ if m:
+ construct = []
+ construct_stack.append(CONSTRUCT_MACRO)
+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct)
+ return
+
+ m = re_if.match(line)
+ if m:
+ if m.group('set'):
+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets)
+ else:
+ # not not forces condition to a bool (this matters if condition is
+ # something mutable like a list)
+ condition = not not arg_eval(m.group('condition'), sets)
+ construct = [(condition, not condition)]
+ construct_stack.append(CONSTRUCT_IF)
+ return
+
+ m = re_rep.match(line)
+ if m:
+ count = arg_eval(m.group('count'), sets)
+ if not is_int(count):
+ asm_error('.rep count must be integer')
+ construct = [(m.group('name'), count)]
+ construct_stack.append(CONSTRUCT_REP)
+ return
+
+ m = re_include.match(line)
+ if m:
+ filename = arg_eval(m.group('filename'), sets)
+ if not isinstance(filename, str):
+ asm_error('expected string')
+ asm_file(sets, '%s: %s' % (current_location, filename), filename)
+ return
+
+ m = re_set.match(line)
+ if m:
+ sets[m.group('name')] = arg_eval(m.group('val'), sets)
+ return
+
+ m = re_unset.match(line)
+ if m:
+ name = m.group('name')
+ if name not in sets:
+ asm_error('%s not set' % name)
+ if name in arg_defs: # todo: see arg_eval
+ sets[name] = arg_defs[name]
+ else:
+ del sets[name]
+ return
+
+ m = re_eval.match(line)
+ if m:
+ arg_eval(m.group('expr'), sets)
+ return
+
+ m = re_print_info_warn_error.match(line)
+ if m:
+ def print_fn(message):
+ print message
+ def info_fn(message):
+ sys.stderr.write('%s\n' % message)
+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[
+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets))
+ return
+
+ m = re_assert.match(line)
+ if m:
+ if not arg_eval(m.group('condition'), sets):
+ asm_error('assertion failure: \'%s\'' % m.group('condition'))
+ return
+
+ m = re_data.match(line)
+ if m:
+ size = int(m.group('size'))
+ for datum in smart_split(m.group('data')):
+ datum = arg_eval(datum, sets)
+ if not is_int(datum):
+ asm_error('datum must be integer')
+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size))
+ return
+
+ m = re_macro_inst.match(line)
+ if m:
+ name = m.group('name')
+ if name in macros:
+ params, lines = macros[name]
+ args = smart_split(m.group('args'))
+ if len(args) > len(params):
+ asm_error('too many arguments to macro')
+ sets = sets.copy()
+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args)))
+ for param in params[len(args):]:
+ if param in sets:
+ if param in arg_defs: # todo: see arg_eval
+ sets[param] = arg_defs[param]
+ else:
+ del sets[param]
+ for location, line in lines:
+ asm_line(sets, '%s: %s' % (current_location, location), line)
+ return
+
+ if line == '.pushnwarn':
+ nwarn_level += 1
+ return
+ if line == '.popnwarn':
+ if nwarn_level == 0:
+ asm_error('.popnwarn without .pushnwarn')
+ nwarn_level -= 1
+ return
+
+ # everything below assumes prog is up to date
+ asm_flush_prog_data()
+
+ m = re_label.match(line)
+ if m:
+ name = m.group('name')
+ if name[0].isdigit():
+ labels.setdefault(name, []).append(len(prog))
+ else:
+ if name[0] == ':':
+ undecorated_name = name[1:]
+ else:
+ undecorated_name = name
+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels):
+ asm_error('named label defined twice')
+ labels[name] = len(prog)
+ return
+
+ annots = line.split('@')
+ ops = [op.strip() for op in annots[0].split(';')]
+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), [])
+ sig = get_sig(ops[-1])
+ if sig != SIG_NORMAL:
+ ops = ops[:-1]
+ if len(ops) > 2:
+ asm_error('too many ops')
+ elif (len(ops) == 1) and (ops[0] == ''):
+ ops = []
+ ops = (ops + ['nop', 'nop'])[:2]
+ m = re_op.match(ops[0])
+ if not m:
+ asm_error('invalid syntax')
+ aop, aargs_n = get_aop(m.group('op'))
+ if (aop == AOP_BRA) or (aop == AOP_BRR):
+ acond = get_bcond(m.group('cond'))
+ else:
+ acond = get_cond(m.group('cond'))
+ asf = get_setf(m.group('sf'))
+ aargs = smart_split(m.group('args'))
+ if len(aargs) != aargs_n:
+ asm_error('wrong operand count')
+ ard, ara, arb = (aargs + [None, None, None])[:3]
+ m = re_op.match(ops[1])
+ if not m:
+ asm_error('invalid syntax')
+ mop, margs_n = get_mop(m.group('op'))
+ mcond = get_cond(m.group('cond'))
+ msf = get_setf(m.group('sf'))
+ margs = smart_split(m.group('args'))
+ if len(margs) != margs_n:
+ asm_error('wrong operand count')
+ mrd, mra, mrb = (margs + [None, None, None])[:3]
+ # eval srcs first so allocator can retire and reuse registers for dst
+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets)
+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets)
+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets)
+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets)
+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets)
+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets)
+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or
+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))):
+ asm_error('cannot have 2 arguments with different rotations')
+ if aarmux is not None:
+ awrot = (awrot + aadrot) % 16
+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16
+ if (awrot != 0) or awrot_r5:
+ asm_error('rotate not allowed on add write')
+ if marmux is not None:
+ mwrot = (mwrot + madrot) % 16
+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16
+
+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI)
+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF)
+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes(
+ [aarpack, abrpack, marpack, mbrpack],
+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL],
+ aop == AOP_FTOI,
+ [awpack, mwpack],
+ [afloatw, mop == MOP_FMUL])
+ if forcebs[0]:
+ aarmux = RMUX_B
+ if forcebs[1]:
+ abrmux = RMUX_B
+ if forcebs[2]:
+ marmux = RMUX_B
+ if forcebs[3]:
+ mbrmux = RMUX_B
+
+ # extend nops to 3 operands
+ if aop == AOP_NOP:
+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+ if mop == MOP_NOP:
+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+
+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand)
+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ):
+ if forcerafloat:
+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand
+ # instead of duplicating the 2nd operand, take the ra operand from
+ # the mul op thus forcing the ra value to be considered a float for
+ # the purposes of unpacking
+ if marmux == RMUX_A:
+ abraddr, abrmux = maraddr, marmux
+ else:
+ assert mbrmux == RMUX_A
+ abraddr, abrmux = mbraddr, mbrmux
+ else:
+ abraddr, abrmux = aaraddr, aarmux
+ else:
+ assert not forcerafloat # can only forcerafloat if we have an unused operand
+
+ # handle write addrs
+ if (awmux == mwmux) and (awmux != WMUX_ANY):
+ asm_error('add/mul ops not allowed to write to same regfile')
+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A)
+
+ # handle branch
+ if (aop == AOP_BRA) or (aop == AOP_BRR):
+ # check setf
+ if asf:
+ asm_error('setf not allowed on bra/brr')
+
+ # check pack/unpack
+ if (pack != 0) or (unpack != 0):
+ asm_error('pack/unpack not allowed with bra/brr')
+
+ # handle read address
+ if aarmux == RMUX_LABEL:
+ if (aop == AOP_BRA) and aaraddr[1]:
+ asm_warning('bra with rel label')
+ if (aop == AOP_BRR) and (not aaraddr[1]):
+ asm_warning('brr with abs label')
+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+ if aarmux == RMUX_ANY:
+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A
+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A):
+ asm_error('branch destination must be either label, immediate, or from regfile a')
+ if aarmux == RMUX_IMM:
+ imm = aaraddr
+ raddr = 0 # can't use RADDR_NOP
+ elif aarmux == RMUX_A:
+ if (aaraddr[1] != 0) or (aaraddr[2] != 0):
+ asm_error('rotation of read from regfile a not allowed with branch')
+ if aop == AOP_BRR:
+ asm_warning('brr with ra')
+ imm = 0
+ raddr = aaraddr[0]
+ else:
+ assert 0
+
+ # check mul op is nop
+ if mop != MOP_NOP:
+ asm_error('mul op not allowed with branch')
+
+ # check sig
+ if sig != SIG_NORMAL:
+ asm_error('no signal allowed with branch')
+
+ if raddr >= 32:
+ asm_error('can only branch to register locations in physical regfile')
+ if raddr & 1:
+ asm_warning('branch instruction will destroy flags (see hw-2780)')
+
+ # construct branch instruction
+ prog.append((imm,
+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28),
+ line, annots))
+
+ return
+
+ # use COND_NEVER when possible (might save power / allow mul setf)
+ if not dict(annots).get('preserve_cond', 0):
+ if (awaddr == WADDR_NOP) and (not asf):
+ acond = COND_NEVER
+ if (mwaddr == WADDR_NOP) and (not msf):
+ mcond = COND_NEVER
+
+ # attempt to convert movs to ldi
+ if (# no mul setf
+ (not msf) and
+ # ops must either be nop or mov of sema/label/imm/immv
+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+ # but we don't want 2 nops
+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and
+ # if both ops are movs, srcs must be identical
+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and
+ # no signal
+ (sig == SIG_NORMAL)):
+ # make sure aarmux/aaraddr contains the value
+ if aop != AOP_MOV:
+ aarmux = marmux
+ aaraddr = maraddr
+
+ # convert immediate
+ if aarmux == RMUX_SEMA:
+ ldi_mode = LDI_SEMA
+ elif aarmux == RMUX_LABEL:
+ ldi_mode = LDI_32
+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+ elif aarmux == RMUX_IMMV:
+ signed, unsigned = True, True
+ imm = 0
+ for i, elem in enumerate(aaraddr):
+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1):
+ signed = False
+ if elem not in (0, 1, 2, 3):
+ unsigned = False
+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i))
+ if not (signed or unsigned):
+ asm_error('can\'t encode vector immediate')
+ if signed:
+ ldi_mode = LDI_EL_SIGNED
+ else:
+ ldi_mode = LDI_EL_UNSIGNED
+ aaraddr, aarmux = imm, RMUX_IMM
+ elif aarmux == RMUX_IMM:
+ ldi_mode = LDI_32
+ else:
+ assert 0
+
+ # construct ldi instruction
+ prog.append((aaraddr,
+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28),
+ line, annots))
+
+ return
+
+ # convert movs to alu ops
+ if aop == AOP_MOV:
+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0):
+ aop = AOP_XOR
+ aaraddr, aarmux = 0, RMUX_AC
+ abraddr, abrmux = 0, RMUX_AC
+ else:
+ aop = AOP_OR
+ abraddr, abrmux = aaraddr, aarmux
+ if mop == MOP_MOV:
+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0):
+ mop = MOP_V8SUBS
+ maraddr, marmux = 0, RMUX_AC
+ mbraddr, mbrmux = 0, RMUX_AC
+ else:
+ mop = MOP_V8MIN
+ mbraddr, mbrmux = maraddr, marmux
+
+ # normal alu instruction...
+
+ # handle setf
+ if asf and (aop == AOP_NOP):
+ asm_error('nop.setf is not allowed in add pipe')
+ if msf and (mop == MOP_NOP):
+ asm_warning('nop.setf, really?')
+ if (aop == AOP_NOP) or (acond == COND_NEVER):
+ sf = msf
+ else:
+ if msf:
+ asm_error('setf only allowed on mul op if add op is nop or add condition is never')
+ sf = asf
+
+ # handle read addrs
+ raddr_a = None
+ raddr_b = None
+ immb = False
+ arot_r5 = False
+ muxes = [0, 0, 0, 0]
+ if mwrot != 0:
+ raddr_b = 48 + mwrot
+ immb = True
+ if mwrot_r5 and have_am:
+ raddr_b = 48
+ immb = True
+ for f in (lambda rmux: rmux != RMUX_ANY), (lambda rmux: rmux == RMUX_ANY): # do RMUX_ANY last
+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux):
+ if f(rmux):
+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux)
+ add_a, add_b, mul_a, mul_b = muxes
+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)):
+ # some output elements might not be as expected
+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)):
+ bad_elems = 0xffff
+ else:
+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111
+ if mwrot > 12:
+ bad_elems ^= 0xffff
+ bad_elems &= dict(annots).get('mul_used', 0xffff)
+ if not msf:
+ if mwaddr == WADDR_NOP:
+ # not writing anywhere and not setting flags. no elements used
+ bad_elems = 0
+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or
+ ((not ws) and (mwaddr == 37))):
+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/
+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags.
+ # only use element 0
+ bad_elems &= 0x0001
+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or
+ ((not ws) and (mwaddr == 42))):
+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting
+ # flags. only use elements 0, 4, 8, and 12
+ bad_elems &= 0x1111
+ if bad_elems:
+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected')
+ if raddr_a is None:
+ raddr_a = RADDR_NOP
+ if raddr_b is None:
+ raddr_b = RADDR_NOP
+ if immb:
+ if sig != SIG_NORMAL:
+ asm_error('rotation/immediates and signal don\'t mix')
+ sig = SIG_SMALLIMMED
+ if arot_r5 or (mwrot_r5 and (not have_am)):
+ if sig != SIG_NORMAL:
+ asm_error('rotation/immediates/signal don\'t mix')
+ sig = SIG_ROTATE
+
+ # construct instruction
+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29),
+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28),
+ line, annots))
+ finally:
+ current_location = prev_location
+
+def preprocess_passthrough(file):
+ line_number = 0
+ for line in file:
+ line_number += 1
+ yield line_number, line
+
+def asm_file(sets, location, filename, preprocess = None):
+ global current_dir, current_location
+
+ if filename is None:
+ location = '<stdin>'
+ file = sys.stdin
+
+ prev_dir = current_dir
+ else:
+ filename = os.path.normpath(os.path.join(current_dir, filename))
+
+ try:
+ file = open(filename)
+ except Exception, e:
+ asm_error(e)
+ except:
+ asm_error('unknown error while opening file %s' % filename)
+
+ prev_dir = current_dir
+ current_dir = os.path.dirname(filename)
+
+ prev_location = current_location
+ current_location = location
+
+ if preprocess is None:
+ preprocess = preprocess_passthrough
+
+ try:
+ for line_number, line in preprocess(file):
+ # strip off comments and whitespace
+ line = line.split('#')[0].strip()
+ if line == '':
+ continue
+
+ asm_line(sets, '%s: %d' % (current_location, line_number), line)
+ finally:
+ current_dir = prev_dir
+ current_location = prev_location
+
+def asm_end_prog():
+ # check we aren't in a multi-line construct (eg .macro or .rep)
+ if construct != None:
+ asm_error({
+ CONSTRUCT_MACRO: '.macro without .endm',
+ CONSTRUCT_IF: '.if/.elif without .endif',
+ CONSTRUCT_ELSE: '.else without .endif',
+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]])
+
+ # check no warnings level back to 0
+ if nwarn_level != 0:
+ asm_error('.pushnwarn without .popnwarn')
+
+ # flush queued up data
+ asm_flush_prog_data()
+
+ # fixup all the label references we can
+ for pc in xrange(len(prog)):
+ if isinstance(prog[pc][0], tuple):
+ location, label, rel, offset = prog[pc][0]
+ if label[0].isdigit():
+ label_pcs = labels.get(label[:-1], [])
+ if label[-1] == 'b':
+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:]
+ else:
+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1]
+ if label_pcs == []:
+ asm_error('search for label reached begin/end of file', location = location)
+ imm = label_pcs[0]
+ elif label in labels:
+ imm = labels[label]
+ elif (':' + label) in labels:
+ imm = labels[':' + label]
+ elif external_link:
+ continue # let the external linker deal with it
+ else:
+ asm_error('undefined label', location = location)
+ imm = (imm * 8) + offset
+ if rel:
+ imm -= (pc + 4) * 8 # relative to instruction after delay slots
+ imm &= (1 << 32) - 1
+ else:
+ if not external_link:
+ asm_error('can\'t get absolute address without using an external linker. this mode doesn\'t have an external linker', location = location)
+ imm = (location, label, rel, offset, imm)
+ prog[pc] = (imm,) + prog[pc][1:]
+
+def asm_init():
+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level
+
+ current_dir = os.getcwd()
+ current_location = ''
+ prog = []
+ prog_data = []
+ macros = {
+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]),
+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])}
+ labels = {}
+ construct = None
+ construct_stack = []
+ nwarn_level = 0
+
+def asm_reset_prog():
+ global prog, labels
+
+ prog = []
+ labels = {}
+
+###############################################################################
+# dumping
+###############################################################################
+
+def print_lines(lines):
+ for line in lines:
+ print line
+
+class dumper_t:
+ def external_link(self): return False
+ def begin(self): pass
+ def label(self, pc, name): pass
+ def line(self, pc, ls, ms, line, annots, first): pass
+ def end(self): pass
+ def sets(self, sets): pass
+ def direct(self, line): pass
+
+class clif_dumper_t(dumper_t):
+ def __init__(self):
+ self.annot_mode = 0
+
+ def external_link(self):
+ return True
+
+ def parse_annot_mode(self, line):
+ l = line.split(',')
+ self.annot_mode = int(l[0])
+ if self.annot_mode not in (0, 1, 2):
+ asm_error('bad annot mode')
+ if self.annot_mode == 2:
+ if len(l) != 2:
+ asm_error('expected buffer name')
+ self.annot_name = l[1].strip()
+ self.annot_offset = 0
+ elif len(l) != 1:
+ asm_error('unexpected comma')
+
+ def label(self, pc, name):
+ if (self.annot_mode != 1) and (name[0] == ':'):
+ if self.annot_mode == 2:
+ name = name + '_annotations'
+ print '@label %s' % name[1:]
+ else:
+ print '// :%s' % name
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if self.annot_mode == 0:
+ if isinstance(ls, tuple):
+ if len(ls) == 5:
+ location, label, rel, offset, offset_from_prog = ls
+ assert not rel
+ ls = '[. - %d + %d]' % (pc * 8, offset_from_prog)
+ else:
+ location, label, rel, offset = ls
+ if rel:
+ asm_error('relative external label references not allowed in this mode', location = location)
+ ls = '[%s + %d]' % (label, offset)
+ else:
+ ls = '0x%08x' % ls
+ print '%s 0x%08x // %s' % (ls, ms, line)
+ elif self.annot_mode == 1:
+ print '// %s' % line
+ for annot in annots:
+ print '0x%08x 0x%08x // %s' % ({
+ # todo: would rather not have these hard coded
+ 'mul_used': 1,
+ 'preserve_cond': 2,
+ 'geomd_open': 3,
+ 'geomd_i': 4,
+ 'geomd_tris_clear': 5,
+ 'geomd_verts': 6,
+ 'geomd_tris_add': 7,
+ 'geomd_tris_set_center': 8,
+ 'geomd_region_clear': 9,
+ 'geomd_region_set': 10,
+ 'geomd_images_clear': 11,
+ 'geomd_images_l': 12,
+ 'geomd_images_b': 13,
+ 'geomd_images_r': 14,
+ 'geomd_images_t': 15,
+ 'geomd_images_add_vpm': 16,
+ 'trace_4c': 17,
+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0])
+ if len(annots) != 0:
+ print '0x00000000 // end'
+ else:
+ assert self.annot_mode == 2
+ if len(annots) == 0:
+ print '0x00000000 // %s' % line
+ else:
+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
+ self.annot_offset += (len(annots) * 8) + 4
+
+ def direct(self, line):
+ print line
+
+class plain_dumper_t(dumper_t):
+ def line(self, pc, ls, ms, line, annots, first):
+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
+
+class c_c_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, array_name):
+ self.header_name = header_name
+ self.array_name = array_name
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ self.external_labels = set()
+ self.lines = []
+
+ print '#include "%s.h"' % self.header_name
+ print ''
+ print '#ifdef _MSC_VER'
+ print ' #include <stdint.h>'
+ print ' /* cast through uintptr_t to avoid warnings */'
+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
+ print '#else'
+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))'
+ print '#endif'
+ print ''
+ print '#ifdef __cplusplus'
+ print 'extern "C" { /* the types are probably wrong... */'
+ print '#endif'
+
+ def label(self, pc, name):
+ self.lines.append('// :%s' % name)
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if isinstance(ls, tuple):
+ if len(ls) == 5:
+ location, label, rel, offset, offset_from_prog = ls
+ assert not rel
+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
+ else:
+ location, label, rel, offset = ls
+ if rel:
+ asm_error('relative external label references not allowed in this mode', location = location)
+ if label not in self.external_labels:
+ self.external_labels.add(label)
+ print 'extern uint8_t %s[];' % label
+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
+ else:
+ ls = '0x%08x' % ls
+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+
+ def end(self):
+ print '#ifdef __cplusplus'
+ print '}'
+ print '#endif'
+ print ''
+ print '#ifdef _MSC_VER'
+ print '__declspec(align(8))'
+ print '#elif defined(__GNUC__)'
+ print '__attribute__((aligned(8)))'
+ print '#endif'
+ print 'unsigned int %s[] = {' % self.array_name
+ print_lines(self.lines)
+ print '};'
+ print '#ifdef __HIGHC__'
+ print '#pragma Align_to(8, %s)' % self.array_name
+ print '#endif'
+
+class c_h_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, array_name):
+ self.full_header_name = full_header_name
+ self.array_name = array_name
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ print '#ifndef %s_H' % self.full_header_name
+ print '#define %s_H' % self.full_header_name
+ print ''
+ print 'extern unsigned int %s[];' % self.array_name
+ print ''
+
+ def label(self, pc, name):
+ if name[0] == ':':
+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
+
+ def end(self):
+ print ''
+ print '#endif'
+
+class ml_c_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, name, annots):
+ self.header_name = header_name
+ self.name = name
+ self.annots = annots
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ if self.annots:
+ self.annot_lines = []
+ self.lines = []
+ self.external_labels = set()
+ self.link_lines = []
+
+ print '#include "%s.h"' % self.header_name
+ print '#include <assert.h>'
+ if self.annots:
+ print '#ifdef SIMPENROSE'
+ print '#include <stddef.h>'
+ print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
+ print ''
+
+ def label(self, pc, name):
+ self.lines.append('// :%s' % name)
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if self.annots:
+ if len(annots) == 0:
+ self.annot_lines.append('NULL,')
+ else:
+ print 'static unsigned int const annotations_%d[] = {' % pc
+ for annot in annots:
+ print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
+ print ' SIMPENROSE_SHADER_ANNOTATION_END};'
+ print ''
+ self.annot_lines.append('annotations_%d,' % pc)
+ if isinstance(ls, tuple):
+ self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2))
+ if len(ls) == 5:
+ location, label, rel, offset, offset_from_prog = ls
+ assert not rel
+ self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog))
+ else:
+ location, label, rel, offset = ls
+ self.external_labels.add(label)
+ if rel:
+ self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
+ else:
+ self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset))
+ ls = '0xdeadbeef'
+ else:
+ ls = '0x%08x' % ls
+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+
+ def end(self):
+ if self.annots:
+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name
+ print_lines(self.annot_lines)
+ print '};'
+ print '#endif'
+ print ''
+ print 'static unsigned int const array[] = {'
+ print_lines(self.lines)
+ print '};'
+ print ''
+ print 'void %s_link(void *p_in, unsigned int base' % self.name
+ for label in sorted(self.external_labels):
+ print ' , unsigned int %s' % label
+ print ' )'
+ print '{'
+ print ' unsigned int *p = (unsigned int *)p_in;'
+ print ' unsigned int i;'
+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper()
+ print ' p[i] = array[i];'
+ print ' }'
+ print_lines(self.link_lines)
+ print '}'
+
+class ml_h_dumper_t(dumper_t):
+ def __init__(self, header_name, full_header_name, name, annots):
+ self.full_header_name = full_header_name
+ self.name = name
+ self.annots = annots
+
+ def external_link(self):
+ return True
+
+ def begin(self):
+ self.external_labels = set()
+ self.lines_n = 0
+
+ print '#ifndef %s_H' % self.full_header_name
+ print '#define %s_H' % self.full_header_name
+ print ''
+ if self.annots:
+ print '#ifdef SIMPENROSE'
+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name
+ print '#endif'
+ print ''
+
+ def label(self, pc, name):
+ if name[0] == ':':
+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8)
+ if self.annots:
+ print '#ifdef SIMPENROSE'
+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc)
+ print '#endif'
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if isinstance(ls, tuple) and (len(ls) != 5):
+ self.external_labels.add(ls[1])
+ self.lines_n += 1
+
+ def end(self):
+ print ''
+ print 'extern void %s_link(void *p, unsigned int base' % self.name
+ for label in sorted(self.external_labels):
+ print ' , unsigned int %s' % label
+ print ' );'
+ print ''
+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8))
+ print ''
+ print '#endif'
+
+def print_lines_lc(lines):
+ for line in lines:
+ print '%s \\' % line
+
+def print_groups_lc(groups):
+ first = True
+ for group in groups:
+ if first:
+ print '{ \\'
+ else:
+ print ', { \\'
+ print_lines_lc(group)
+ print '} \\'
+ first = False
+
+class inline_c_dumper_t(dumper_t):
+ def __init__(self, annots):
+ self.annots = annots
+ self.iteration = False
+
+ def begin_iteration(self):
+ assert not self.iteration
+ self.iteration = True
+ self.iteration_lines = []
+ if self.annots:
+ self.iteration_annot_lines = []
+ self.annot_arrs = []
+
+ def end_iteration(self):
+ assert self.iteration
+ self.iteration = False
+ print '%d, \\' % self.iteration_n
+ if self.annots:
+ print '( \\'
+ print_groups_lc(self.iteration_lines)
+ if self.annots:
+ print '), ( \\'
+ print_groups_lc(self.iteration_annot_lines)
+ print '), ( \\'
+ for annot_arr in self.annot_arrs:
+ print_lines_lc(annot_arr)
+ print ') \\'
+
+ def begin(self):
+ self.n = 0
+ self.lines = []
+ if self.annots:
+ self.annot_lines = []
+ if not self.iteration:
+ self.annot_arrs = []
+
+ def label(self, pc, name):
+ self.lines.append('/* :%s */' % name)
+ if self.annots:
+ self.annot_lines.append('/* :%s */' % name)
+
+ def line(self, pc, ls, ms, line, annots, first):
+ self.n += 1
+ if first:
+ prefix = ''
+ else:
+ prefix = ', '
+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line))
+ if self.annots:
+ if len(annots) == 0:
+ a = 'NULL'
+ else:
+ a = 'annotations_%d' % len(self.annot_arrs)
+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)]
+ for annot in annots:
+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]))
+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};')
+ self.annot_arrs.append(annot_arr)
+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line))
+
+ def end(self):
+ if self.iteration:
+ if len(self.iteration_lines) == 0:
+ self.iteration_n = self.n
+ elif self.iteration_n != self.n:
+ asm_error('number of instructions differs between iterations')
+ self.iteration_lines.append(self.lines)
+ if self.annots:
+ self.iteration_annot_lines.append(self.annot_lines)
+ else:
+ if self.annots:
+ print '( \\'
+ print_lines_lc(self.lines)
+ if self.annots:
+ print '), ( \\'
+ print_lines_lc(self.annot_lines)
+ print '), ( \\'
+ for annot_arr in self.annot_arrs:
+ print_lines_lc(annot_arr)
+ print ') \\'
+
+ def direct(self, line):
+ print line
+
+class asvc_dumper_t(dumper_t):
+ def external_link(self):
+ return True
+
+ def begin(self):
+ print '.align 8'
+
+ def label(self, pc, name):
+ if name[0] == ':':
+ print '%s::' % name[1:]
+ else:
+ print '%s:' % name
+
+ def line(self, pc, ls, ms, line, annots, first):
+ if isinstance(ls, tuple):
+ location, label, rel, offset = ls[:4]
+ if rel:
+ ls = '%s + %d - (. + 32)' % (label, offset)
+ else:
+ ls = '%s + %d' % (label, offset)
+ else:
+ ls = '0x%08x' % ls
+ print '.word %s, 0x%08x ; %s' % (ls, ms, line)
+
+def is_ra_or_rb(val):
+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B))
+
+class aliases_dumper_t(dumper_t):
+ def external_link(self):
+ return True
+
+ def begin(self):
+ print '#ifndef JUST_DQASM_ARGS'
+
+ def label(self, pc, name):
+ if not name[0].isdigit():
+ if name[0] == ':':
+ name = name[1:]
+ print '"bs%s", "bs%x",' % (name, pc * 8)
+ print '"bu%s", "bu%x",' % (name, pc * 8)
+
+ def end(self):
+ print '#endif'
+
+ # todo: handle things other than ra and rb? dqasm only allows ra and rb atm
+ def sets(self, sets):
+ dqasm_args = []
+ print '#ifndef JUST_DQASM_ARGS'
+ for name in sets:
+ if is_ra_or_rb(sets[name]):
+ dqasm_args.append('-r%s=%s' % (sets[name], name))
+ print '"%s", "%s",' % (name, sets[name])
+ elif isinstance(sets[name], list):
+ for i, val in enumerate(sets[name]):
+ if is_ra_or_rb(val):
+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i))
+ print '"%s[%d]", "%s",' % (name, i, val)
+ print '#endif'
+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args)
+
+def dump(dumper):
+ if (len(prog) != 0) or (len(labels) != 0):
+ dumper.begin()
+
+ sorted_labels = []
+ for name in labels:
+ if name[0].isdigit():
+ for pc in labels[name]:
+ sorted_labels.append((pc, name))
+ else:
+ sorted_labels.append((labels[name], name))
+ sorted_labels.sort(reverse = True)
+
+ first = True
+ for pc in xrange(len(prog)):
+ ls, ms, line, annots = prog[pc]
+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc):
+ dumper.label(*sorted_labels.pop())
+ dumper.line(pc, ls, ms, line, annots, first)
+ first = False
+ for sorted_label in sorted_labels:
+ assert sorted_label[0] == len(prog)
+ dumper.label(*sorted_label)
+
+ dumper.end()
+
+###############################################################################
+# preprocessing
+###############################################################################
+
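+# In inline_c mode, assembly is embedded in the C source between '%[' and '%]'.
+# An optional '%|' separates the body from an iteration spec: iterations are
+# separated by '%,', parameters within an iteration by '%/', and '%0', '%1', ...
+# in the body are replaced by those parameters. For example (hypothetical):
+#   %[ mov %0, %1 %| ra0 %/ unif %, ra1 %/ vary %]
+# assembles the block twice, once per parameter set.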
+def preprocess_inline_c(dumper):
+ def preprocess(file):
+ ls = None
+ line_number = 0
+ for line in file:
+ line_number += 1
+ while True:
+ if ls is None:
+ l = line.split('%[', 1)
+ if len(l) == 1:
+ dumper.direct(l[0].rstrip())
+ break
+ dumper.direct('%s \\' % l[0].rstrip())
+ line = l[1]
+ ls = []
+ else:
+ l = line.split('%]', 1)
+ ls.append((line_number, l[0]))
+ if len(l) == 1:
+ break
+ line = l[1]
+ l = ls[-1][1].split('%|', 1)
+ if len(l) == 1:
+ for l_number, l in ls:
+ yield l_number, l
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ else:
+ ls[-1] = (ls[-1][0], l[0])
+ if hasattr(dumper, 'begin_iteration'):
+ dumper.begin_iteration()
+ for repls in l[1].split('%,'):
+ repls = [repl.strip() for repl in repls.split('%/')]
+ for l_number, l in ls:
+ for i, repl in enumerate(repls):
+ l = l.replace('%' + str(i), repl)
+ yield l_number, l
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ if hasattr(dumper, 'end_iteration'):
+ dumper.end_iteration()
+ ls = None
+ return preprocess
+
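+# In clif mode, lines between a lone '%[' and '%]' are assembled, as are other
+# lines beginning with '%' (with the '%' stripped); '%@' selects the annotation
+# mode and everything else is passed straight through to the output.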
+def preprocess_clif(dumper):
+ def preprocess(file):
+ in_asm = False
+ line_number = 0
+ for line in file:
+ line_number += 1
+ if in_asm:
+ if line.strip() == '%]':
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ in_asm = False
+ else:
+ yield line_number, line
+ else:
+ if line.strip() == '%[':
+ in_asm = True
+ elif (line[:1] == '%') and (line[:2] != '%@'):
+ yield line_number, line[1:]
+ else:
+ asm_end_prog()
+ dump(dumper)
+ asm_reset_prog()
+ if line[:2] == '%@':
+ if hasattr(dumper, 'parse_annot_mode'):
+ dumper.parse_annot_mode(line[2:])
+ else:
+ dumper.direct(line.rstrip())
+ return preprocess
+
+###############################################################################
+# main
+###############################################################################
+
+def main():
+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
+
+ asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
+
+ # parse command line
+ parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
+ parser.add_option('-m', '--mode', dest = 'mode',
+ help = '<mode> should be clif, plain, ' +
+ 'c_c:<header_name>,<full_header_name>,<array_name>, ' +
+ 'c_h:<header_name>,<full_header_name>,<array_name>, ' +
+ 'ml_c:<header_name>,<full_header_name>,<name>[,annots], ' +
+ 'ml_h:<header_name>,<full_header_name>,<name>[,annots], ' +
+ 'inline_c[:annots], asvc, or aliases[:<preprocess_mode>]', metavar = '<mode>')
+ parser.add_option('-t', '--target', dest = 'target',
+ help = '<target> should be a0, b0, or hera', metavar = '<target>')
+ parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
+ parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
+ parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
+ parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
+ parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
+ options, args = parser.parse_args()
+ if len(args) == 0:
+ filename = None
+ elif len(args) == 1:
+ filename = args[0]
+ else:
+ parser.print_help()
+ sys.exit(-1)
+
+ # handle mode
+ mode = options.mode or 'clif' # assume clif if no mode specified
+ if mode == 'clif':
+ dumper = clif_dumper_t()
+ preprocess = preprocess_clif(dumper)
+ elif mode == 'plain':
+ dumper = plain_dumper_t()
+ preprocess = None
+ elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
+ mode_options = mode[4:].split(',')
+ if len(mode_options) != 3:
+ asm_error('badly formatted mode on command line')
+ dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
+ preprocess = None
+ elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
+ mode_options = mode[5:].split(',')
+ if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
+ asm_error('badly formatted mode on command line')
+ dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
+ }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
+ preprocess = None
+ elif mode == 'inline_c':
+ dumper = inline_c_dumper_t(False)
+ preprocess = preprocess_inline_c(dumper)
+ elif mode == 'inline_c:annots':
+ dumper = inline_c_dumper_t(True)
+ preprocess = preprocess_inline_c(dumper)
+ elif mode == 'asvc':
+ dumper = asvc_dumper_t()
+ preprocess = None
+ elif mode == 'aliases':
+ dumper = aliases_dumper_t()
+ preprocess = None
+ elif mode == 'aliases:inline_c':
+ dumper = aliases_dumper_t()
+ preprocess = preprocess_inline_c(dumper)
+ else:
+ asm_error('invalid mode')
+ external_link = dumper.external_link()
+
+ # handle target
+ target = options.target or 'b0' # assume b0 if no target specified
+ if target == 'a0':
+ have_sema = False
+ have_am = False
+ mulw_rotate = False
+ have_lthrsw = False
+ elif target == 'b0':
+ have_sema = True
+ have_am = True
+ mulw_rotate = True
+ have_lthrsw = True
+ elif target == 'hera':
+ have_sema = True
+ have_am = False
+ mulw_rotate = True
+ have_lthrsw = True
+ else:
+ asm_error('invalid target')
+ if have_am:
+ sigs['loadam'] = SIG_LOADAM
+ arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
+ if have_lthrsw:
+ sigs['lthrsw'] = SIG_LTHRSW
+ del sigs['int']
+ arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
+
+ # handle misc options
+ allow_xor_0 = options.allow_xor_0
+ dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
+ warnings_are_errors = options.warnings_are_errors
+ disable_warnings = options.disable_warnings
+
+ # make options visible to asm
+ arg_defs['mode'] = mode
+ arg_defs['target'] = target
+
+ # arg_defs all setup at this point
+ sets = arg_defs.copy() # todo: see arg_eval
+
+ # handle command line sets
+ re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
+ for options_set in options.sets:
+ m = re_options_set.match(options_set)
+ if not m:
+ asm_error('badly formatted set on command line')
+ sets[m.group('name')] = arg_eval(m.group('val'), sets)
+
+ # assemble input file and dump
+ asm_file(sets, filename, filename, preprocess)
+ asm_end_prog()
+ dump(dumper)
+ for name in arg_defs: # todo: see arg_eval
+ del sets[name]
+ dumper.sets(sets)
+
+if __name__ == '__main__':
+ main()
diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
new file mode 100755
index 0000000..6a9a33f
--- /dev/null
+++ b/pi-util/rebase_liblinks.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+import os, sys
+from stat import *
+
+def walktree(top, callback, n, prefix):
+ '''recursively descend the directory tree rooted at top,
+ calling the callback function for each symbolic link found'''
+
+ for f in os.listdir(top):
+ pathname = os.path.join(top, f)
+ mode = os.lstat(pathname).st_mode
+ if S_ISDIR(mode):
+ # It's a directory, recurse into it
+ walktree(pathname, callback, n+1, prefix)
+ elif S_ISLNK(mode):
+ # It's a symlink, call the callback function
+ callback(pathname, os.readlink(pathname), n, prefix)
+
+def visitfile(file, linkname, n, prefix):
+ if (linkname.startswith(prefix + 'lib/')):
+ newlink = "../" * n + linkname[len(prefix):]
+ print 'relinking', file, "->", newlink
+ os.remove(file)
+ os.symlink(newlink, file)
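+# e.g. with the default prefix "/", a link two levels down such as
+# usr/lib/libfoo.so -> /lib/arm-linux-gnueabihf/libfoo.so.1 is rewritten to
+# the sysroot-relative ../../lib/arm-linux-gnueabihf/libfoo.so.1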
+
+if __name__ == '__main__':
+ argc = len(sys.argv)
+ if argc == 2:
+ walktree(sys.argv[1], visitfile, 0, "/")
+ elif argc == 3:
+ walktree(sys.argv[1], visitfile, 0, sys.argv[2])
+ else:
+ print "rebase_liblinks.py <local root> [<old sysroot>]"
+
+
+
diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
new file mode 100755
index 0000000..d8bdd91
--- /dev/null
+++ b/pi-util/syncroot.sh
@@ -0,0 +1,43 @@
+set -e
+
+if [ "$1" == "" ]; then
+ echo Usage: $0 \<src_dir\> [\<rootname\>]
+ echo src_dir is a source dir for rsync so may contain a machine name.
+ echo rootname will be set to \"raspian_jessie_pi1\" if missing
+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1
+ exit 1
+fi
+
+SYSROOT_NAME=$2
+if [ "$SYSROOT_NAME" == "" ]; then
+ SYSROOT_NAME=raspian_jessie_pi1
+fi
+
+DST_ROOT=`pwd`
+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot
+SRC=$1
+
+echo Sync src: $SRC
+echo Sync dest: $DST
+
+mkdir -p $DST/lib
+mkdir -p $DST/opt/vc/include
+mkdir -p $DST/usr/lib/pkgconfig
+mkdir -p $DST/usr/bin
+mkdir -p $DST/usr/share
+
+#### MUST NOT include /opt/vc/include/*GL*
+# Creates conflicts with GL includes inside Chrome
+
+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib
+rsync -rl $SRC/opt/vc/lib $DST/opt/vc
+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include
+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include
+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include
+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib
+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib
+rsync -rl $SRC/usr/include $DST/usr
+
+pi-util/rebase_liblinks.py $DST
+
+