mirror of https://github.com/LibreELEC/LibreELEC.tv
synced 2025-09-24 19:46:01 +07:00
18556 lines, 584 KiB
diff --git a/.gitignore b/.gitignore
index 524fb73..305632b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,6 +23,7 @@
.\#*
/.config
/.version
+/build/
/ffmpeg
/ffplay
/ffprobe
diff --git a/ffmpeg.c b/ffmpeg.c
index 9ffd833..7a86d7e 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -23,6 +23,11 @@
 * multimedia converter based on the FFmpeg libraries
 */

+#ifdef RPI
+#define RPI_DISPLAY
+#define RPI_ZERO_COPY
+#endif
+
#include "config.h"
#include <ctype.h>
#include <string.h>
@@ -66,6 +71,25 @@
# include "libavfilter/buffersrc.h"
# include "libavfilter/buffersink.h"

+#ifdef RPI_DISPLAY
+#pragma GCC diagnostic push
+// Many, many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/mmal/mmal.h>
+#include <interface/mmal/mmal_parameters_camera.h>
+#include <interface/mmal/mmal_buffer.h>
+#include <interface/mmal/util/mmal_util.h>
+#include <interface/mmal/util/mmal_default_components.h>
+#include <interface/mmal/util/mmal_connection.h>
+#include <interface/mmal/util/mmal_util_params.h>
+#pragma GCC diagnostic pop
+#ifdef RPI_ZERO_COPY
+#include "libavcodec/rpi_qpu.h"
+#endif
+#include "libavcodec/rpi_zc.h"
+#endif
+
#if HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/types.h>
@@ -158,6 +182,169 @@ static int restore_tty;
static void free_input_threads(void);
#endif

+#ifdef RPI_DISPLAY
+
+#define NUM_BUFFERS 4
+
+static MMAL_COMPONENT_T* rpi_display = NULL;
+static MMAL_POOL_T *rpi_pool = NULL;
+static volatile int rpi_display_count = 0;
+
+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+{
+    MMAL_POOL_T* pool;
+    size_t i;
+    size_t size = (w*h*3)/2;
+#ifdef RPI_ZERO_COPY
+    mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
+    pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
+    assert(pool);
+#else
+    pool = mmal_port_pool_create(port, NUM_BUFFERS, size);
+
+    for (i = 0; i < NUM_BUFFERS; ++i)
+    {
+       MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+       char * bufPtr = buffer->data;
+       memset(bufPtr, i*30, w*h);
+       memset(bufPtr+w*h, 128, (w*h)/2);
+    }
+#endif
+
+    return pool;
+}
+
+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+#ifdef RPI_ZERO_COPY
+    av_rpi_zc_unref(buffer->user_data);
+    --rpi_display_count;
+#endif
+    mmal_buffer_header_release(buffer);
+}
+
+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+    mmal_buffer_header_release(buffer);
+}
+
+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+{
+    MMAL_COMPONENT_T* display;
+    MMAL_DISPLAYREGION_T region =
+    {
+        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+        .layer = 2,
+        .fullscreen = 0,
+        .dest_rect = {x, y, w, h}
+    };
+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+
+    bcm_host_init(); // TODO is this needed?
+    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+    assert(display);
+
+    mmal_port_parameter_set(display->input[0], &region.hdr);
+
+    {
+        MMAL_ES_FORMAT_T* format = display->input[0]->format;
+        format->encoding = MMAL_ENCODING_I420;
+        format->es->video.width = geo.stride_y;
+        format->es->video.height = geo.height_y;
+        format->es->video.crop.x = 0;
+        format->es->video.crop.y = 0;
+        format->es->video.crop.width = w;
+        format->es->video.crop.height = h;
+        mmal_port_format_commit(display->input[0]);
+    }
+
+    mmal_component_enable(display);
+
+    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+
+    mmal_port_enable(display->input[0],display_cb_input);
+    mmal_port_enable(display->control,display_cb_control);
+
+    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+
+    return display;
+}
+
+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+{
+    MMAL_BUFFER_HEADER_T* buf;
+
+    if (!display || !rpi_pool)
+        return;
+
+    if (rpi_display_count >= 3) {
+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+        return;
+    }
+
+    buf = mmal_queue_get(rpi_pool->queue);
+    if (!buf) {
+        // Running too fast so drop the frame
+        printf("Q alloc failure\n");
+        return;
+    }
+    assert(buf);
+    buf->cmd = 0;
+    buf->offset = 0; // Offset to valid data
+    buf->flags = 0;
+#ifdef RPI_ZERO_COPY
+{
+    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+
+    buf->user_data = fr_buf;
+    buf->data = av_rpi_zc_vc_handle(fr_buf);
+    buf->alloc_size =
+        buf->length = av_rpi_zc_numbytes(fr_buf);
+
+    ++rpi_display_count;
+}
+#else
+{
+#error YYY
+    int w = fr->width;
+    int h = fr->height;
+    int w2 = (w+31)&~31;
+    int h2 = (h+15)&~15;
+
+    buf->length = (w2 * h2 * 3)/2;
+    buf->user_data = NULL;
+
+    //mmal_buffer_header_mem_lock(buf);
+    memcpy(buf->data, fr->data[0], w2 * h);
+    memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+    memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+    //mmal_buffer_header_mem_unlock(buf);
+}
+#endif
+
+    while (rpi_display_count >= 3) {
+        usleep(5000);
+    }
+
+    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+    {
+        printf("** send failed: depth=%d\n", rpi_display_count);
+        display_cb_input(NULL, buf);
+    }
+}
+
+static void display_exit(MMAL_COMPONENT_T* display)
+{
+    if (display) {
+        mmal_component_destroy(display);
+    }
+    if (rpi_pool) {
+        mmal_port_pool_destroy(display->input[0], rpi_pool);
+    }
+}
+
+#endif
+
+
/* sub2video hack:
   Convert subtitles to video with alpha to insert them in filter graphs.
   This is a temporary solution until libavfilter gets real subtitles support.
@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
        avformat_close_input(&input_files[i]->ctx);
        av_freep(&input_files[i]);
    }
+
+#ifdef RPI_DISPLAY
+    display_exit(rpi_display);
+#endif
+
    for (i = 0; i < nb_input_streams; i++) {
        InputStream *ist = input_streams[i];

@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
        av_freep(&ist->filters);
        av_freep(&ist->hwaccel_device);

+#ifdef RPI_ZERO_COPY
+        av_rpi_zc_uninit(ist->dec_ctx);
+#endif
        avcodec_free_context(&ist->dec_ctx);

        av_freep(&input_streams[i]);
@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
    }
    term_exit();
    ffmpeg_exited = 1;
+
}

void remove_avoptions(AVDictionary **a, AVDictionary *b)
@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
    if (ost->source_index >= 0)
        ist = input_streams[ost->source_index];

+#ifdef RPI_DISPLAY
+    if (next_picture && ist != NULL)
+    {
+        if (!rpi_display)
+            rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+        display_frame(ist->dec_ctx, rpi_display, next_picture);
+    }
+#endif
+
    if (filter->inputs[0]->frame_rate.num > 0 &&
        filter->inputs[0]->frame_rate.den > 0)
        duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
        ist->dec_ctx->opaque = ist;
        ist->dec_ctx->get_format = get_format;
        ist->dec_ctx->get_buffer2 = get_buffer;
+
+#ifdef RPI_ZERO_COPY
+        // Overrides the above get_buffer2
+        av_rpi_zc_init(ist->dec_ctx);
+#endif
+
        ist->dec_ctx->thread_safe_callbacks = 1;

        av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
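The RPI_ZERO_COPY path above never copies pixels: it hands MMAL a GPU-side handle and keeps the decoded AVFrame alive by reference counting. A minimal sketch of that hand-off, using only the av_rpi_zc_* calls that appear in the patch (the submit_frame() wrapper itself is a hypothetical name, not part of the patch):

    /* Sketch of the zero-copy hand-off done inside display_frame() above. */
    static void submit_frame(AVCodecContext *avctx, MMAL_BUFFER_HEADER_T *buf,
                             const AVFrame *fr)
    {
        AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, fr, 1); /* keep the frame alive for the VPU */
        buf->user_data  = ref;                           /* recovered in display_cb_input() */
        buf->data       = av_rpi_zc_vc_handle(ref);      /* VideoCore handle, not a CPU pointer */
        buf->alloc_size = buf->length = av_rpi_zc_numbytes(ref);
    }
    /* display_cb_input() then drops the reference with av_rpi_zc_unref(buffer->user_data). */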
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index fd0d1f0..40d22d2 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -5,6 +5,11 @@ NAME = avcodec
HEADERS = avcodec.h \
          avdct.h \
          avfft.h \
+         rpi_qpu.h \
+         rpi_shader.h \
+         rpi_mailbox.h \
+         rpi_hevc_transform.h \
+         rpi_zc.h \
          d3d11va.h \
          dirac.h \
          dv_profile.h \
@@ -43,6 +48,10 @@ OBJS = allcodecs.o \
       resample.o \
       resample2.o \
       utils.o \
+      rpi_qpu.o \
+      rpi_shader.o \
+      rpi_mailbox.o \
+      rpi_zc.o \
       vorbis_parser.o \
       xiph.o \

@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
$(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
endif
+
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+
+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 54efaad..02a89c3 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -667,6 +667,7 @@ void avcodec_register_all(void)
    REGISTER_PARSER(H261, h261);
    REGISTER_PARSER(H263, h263);
    REGISTER_PARSER(H264, h264);
+   REGISTER_PARSER(H264_MVC, h264_mvc);
    REGISTER_PARSER(HEVC, hevc);
    REGISTER_PARSER(MJPEG, mjpeg);
    REGISTER_PARSER(MLP, mlp);
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a4ceca7..1354c14 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
                                    arm/hevcdsp_deblock_neon.o \
+                                   arm/hevcdsp_epel_neon.o \
                                    arm/hevcdsp_idct_neon.o \
-                                   arm/hevcdsp_qpel_neon.o
+                                   arm/hevcdsp_qpel_neon.o \
+                                   arm/hevcdsp_sao_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
                                    arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b..0a3980a 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,13 +26,34 @@
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"

+
+#if UNCHECKED_BITSTREAM_READER
+#define LOAD_16BITS_BEHI\
+        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
+        "rev        %[tmp]        , %[tmp]                      \n\t"
+#elif CONFIG_THUMB
+#define LOAD_16BITS_BEHI\
+        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+        "it         cs                                          \n\t"\
+        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+        "rev        %[tmp]        , %[tmp]                      \n\t"
+#else
+#define LOAD_16BITS_BEHI\
+        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+        "rev        %[tmp]        , %[tmp]                      \n\t"
+#endif
+
+
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
                                                 uint8_t *const state)
{
    int bit;
+#if 0
    void *reg_b, *reg_c, *tmp;
-
    __asm__ volatile(
        "ldrb       %[bit]        , [%[state]]                  \n\t"
        "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
        : "memory", "cc"
    );
+#else
+    // *** Not thumb compatible yet
+    unsigned int reg_b, tmp;
+    __asm__ (
+        "ldrb       %[bit]        , [%[state]]                  \n\t"
+        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
+// %bit = *state
+// %range = range
+// %tmp = RangeLPS
+        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+
+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+        "ittt       ge                                          \n\t"
+        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+        "mvnge      %[bit]        , %[bit]                      \n\t"
+        "movge      %[range]      , %[tmp]                      \n\t"
+
+        "clz        %[tmp]        , %[range]                    \n\t"
+        "sub        %[tmp]        , #23                         \n\t"
+
+        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+
+        "strb       %[r_b]        , [%[state]]                  \n\t"
+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+
+        "bne        2f                                          \n\t"
+        LOAD_16BITS_BEHI
+        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
+        "movw       %[r_b]        , #0xFFFF                     \n\t"
+        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+
+        "rbit       %[r_b]        , %[low]                      \n\t"
+        "clz        %[r_b]        , %[r_b]                      \n\t"
+        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+#if CONFIG_THUMB
+        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+        "add        %[low]        , %[low]      , %[tmp]        \n\t"
+#else
+        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+#endif
+        "2:                                                     \n\t"
+        : [bit]"=&r"(bit),
+          [low]"+&r"(c->low),
+          [range]"+&r"(c->range),
+          [r_b]"=&r"(reg_b),
+          [ptr]"+&r"(c->bytestream),
+          [tmp]"=&r"(tmp)
+        : [state]"r"(state),
+          [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+          [byte]"M"(offsetof(CABACContext, bytestream)),
+#if !UNCHECKED_BITSTREAM_READER
+          [c]"r"(c),
+          [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+          [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+        : "memory", "cc"
+    );
+#endif

    return bit & 1;
}
+
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+    int rv = 0;
+    unsigned int tmp;
+    __asm (
+        "lsl        %[low]        , #1                          \n\t"
+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+        "adc        %[rv]         , %[rv]       , #0            \n\t"
+        "it         cs                                          \n\t"
+        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+        "bne        1f                                          \n\t"
+        LOAD_16BITS_BEHI
+        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+        "movw       %[tmp]        , #0xFFFF                     \n\t"
+        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+        "1:                                                     \n\t"
+        : // Outputs
+          [rv]"+&r"(rv),
+          [low]"+&r"(c->low),
+          [tmp]"=&r"(tmp),
+          [ptr]"+&r"(c->bytestream)
+        : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+          [c]"r"(c),
+          [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+          [range]"r"(c->range)
+        : "cc"
+    );
+    return rv;
+}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+    unsigned int tmp;
+    __asm (
+        "lsl        %[low]        , #1                          \n\t"
+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+        "ite        cc                                          \n\t"
+        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
+        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+        "bne        1f                                          \n\t"
+        LOAD_16BITS_BEHI
+        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+        "movw       %[tmp]        , #0xFFFF                     \n\t"
+        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+        "1:                                                     \n\t"
+        : // Outputs
+          [rv]"+&r"(rv),
+          [low]"+&r"(c->low),
+          [tmp]"=&r"(tmp),
+          [ptr]"+&r"(c->bytestream)
+        : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+          [c]"r"(c),
+          [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+          [range]"r"(c->range)
+        : "cc"
+    );
+    return rv;
+}
+
#endif /* HAVE_ARMV6T2_INLINE */

#endif /* AVCODEC_ARM_CABAC_H */
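For orientation, the hand-scheduled bypass routines above implement roughly the following C logic. This is a simplified sketch, not the patch's code; refill_low() is a hypothetical stand-in for the LOAD_16BITS_BEHI reload sequence:

    /* Simplified C equivalent of get_cabac_bypass_arm(). */
    static inline int cabac_bypass_bit(CABACContext *c)
    {
        int bit = 0;
        c->low <<= 1;
        if (c->low >= (c->range << 17)) {  /* range is kept scaled by 2^17 relative to low */
            c->low -= c->range << 17;
            bit = 1;
        }
        if ((c->low << 16) == 0)           /* bottom 16 bits exhausted: reload from bytestream */
            refill_low(c);                 /* hypothetical helper for LOAD_16BITS_BEHI */
        return bit;
    }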
diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
new file mode 100644
index 0000000..31d3c59
--- /dev/null
+++ b/libavcodec/arm/hevc_cabac.h
@@ -0,0 +1,491 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+    unsigned int n;
+    __asm__ (
+        "rev        %[n], %[x]                     \n\t"
+        : [n]"=r"(n)
+        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+        :
+        );
+    return n << (bits & 7);
+}
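The rev-based helper above is just a big-endian 32-bit load on a little-endian CPU; in portable C it is roughly:

    /* Portable equivalent: read 32 bits big-endian at a bit offset, MSB-aligned
       (AV_RB32 is FFmpeg's big-endian read macro from libavutil/intreadwrite.h). */
    static inline uint32_t hevc_mem_bits32_c(const void *p, unsigned int bits)
    {
        return AV_RB32((const uint8_t *)p + (bits >> 3)) << (bits & 7);
    }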
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about / use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+    int rv;
+    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+    __asm__ (
+    "ssat %[rv], #16, %[t], ASR #1 \n\t"
+    : [rv]"=r"(rv)
+    : [t]"r"(t)
+    :
+    );
+    return rv;
+}
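The ssat instruction here performs an arithmetic shift right by one followed by signed 16-bit saturation in a single operation. A plain-C sketch of the same helper, using FFmpeg's av_clip_int16():

    /* Plain-C form of trans_scale_sat_arm(): "ssat rv, #16, t, ASR #1". */
    static inline int trans_scale_sat_c(const int level, const unsigned int scale,
                                        const unsigned int scale_m, const unsigned int shift)
    {
        int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
        return av_clip_int16(t >> 1);   /* saturate to [-32768, 32767] */
    }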
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+    const unsigned int last_coeff_abs_level_remaining,
+    const unsigned int c_rice_param)
+{
+    int t;
+    __asm__ (
+    "lsl   %[t], %[coeff], #1        \n\t"
+    "lsrs  %[t], %[t], %[shift]      \n\t"
+    "it    eq                        \n\t"
+    "subeq %[stat], %[stat], #1      \n\t"
+    "cmp   %[t], #6                  \n\t"
+    "adc   %[stat], %[stat], #0      \n\t"
+    "usat  %[stat], #8, %[stat]      \n\t"
+    : [stat]"+&r"(*stat_coeff),
+         [t]"=&r"(t)
+    :  [coeff]"r"(last_coeff_abs_level_remaining),
+       [shift]"r"(c_rice_param)
+    : "cc"
+    );
+}
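Reading the flags back out of that asm, the rice-statistics update amounts to the C below; a sketch with the clamping kept explicit:

    /* C reading of update_rice_arm(): t = (2*last) >> rice; the stat byte is
       decremented when t == 0, incremented when t >= 6 (the carry from cmp/adc),
       then clamped to 0..255 by the usat. */
    static inline void update_rice_c(uint8_t *const stat_coeff,
                                     const unsigned int last_coeff_abs_level_remaining,
                                     const unsigned int c_rice_param)
    {
        unsigned int t = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
        int s = *stat_coeff;
        if (t == 0) s--;
        if (t >= 6) s++;
        *stat_coeff = av_clip_uint8(s);
    }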
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater-than-1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+    uint8_t * const state0)
+{
+    unsigned int i, reg_b, st, tmp, bit, rv;
+     __asm__ (
+         "mov        %[i]          , #0                          \n\t"
+         "mov        %[rv]         , #0                          \n\t"
+         "1:                                                     \n\t"
+         "add        %[i]          , %[i]        , #1            \n\t"
+         "cmp        %[rv]         , #0                          \n\t"
+         "ite        eq                                          \n\t"
+         "usateq     %[st]         , #2          , %[i]          \n\t"
+         "movne      %[st]         , #0                          \n\t"
+
+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "cmp        %[low]        , %[range], lsl #17           \n\t"
+         "ittt       ge                                          \n\t"
+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+         "mvnge      %[bit]        , %[bit]                      \n\t"
+         "movge      %[range]      , %[tmp]                      \n\t"
+
+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+         "and        %[bit]        , %[bit]      , #1            \n\t"
+         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
+
+         "clz        %[tmp]        , %[range]                    \n\t"
+         "sub        %[tmp]        , #23                         \n\t"
+
+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+         "it         ne                                          \n\t"
+         "cmpne      %[n]          , %[i]                        \n\t"
+         "bne        1b                                          \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+         "tst        %[tmp]        , %[tmp]                      \n\t"
+         "bne        2f                                          \n\t"
+
+// Do reload
+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+         "movw       %[r_b]        , #0xFFFF                     \n\t"
+         "rev        %[tmp]        , %[tmp]                      \n\t"
+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+
+         "rbit       %[r_b]        , %[low]                      \n\t"
+         "clz        %[r_b]        , %[r_b]                      \n\t"
+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+
+#if CONFIG_THUMB
+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+#else
+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+         "cmp        %[n]          , %[i]                        \n\t"
+         "bne        1b                                          \n\t"
+         "2:                                                     \n\t"
+         : [bit]"=&r"(bit),
+           [low]"+&r"(c->low),
+           [range]"+&r"(c->range),
+           [r_b]"=&r"(reg_b),
+           [bptr]"+&r"(c->bytestream),
+           [i]"=&r"(i),
+           [tmp]"=&r"(tmp),
+           [st]"=&r"(st),
+           [rv]"=&r"(rv)
+          : [state0]"r"(state0),
+            [n]"r"(n),
+            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+            [byte]"M"(offsetof(CABACContext, bytestream)),
+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+          : "memory", "cc"
+    );
+    return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+    unsigned int n,
+    const uint8_t * const ctx_map,
+    uint8_t * p)
+{
+    unsigned int reg_b, tmp, st, bit;
+     __asm__ (
+         "1:                                                     \n\t"
+// Get bin from map
+         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
+
+// Load state & ranges
+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
+         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "cmp        %[low]        , %[range], lsl #17           \n\t"
+         "ittt       ge                                          \n\t"
+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+         "mvnge      %[bit]        , %[bit]                      \n\t"
+         "movge      %[range]      , %[tmp]                      \n\t"
+
+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+         "tst        %[bit]        , #1                          \n\t"
+// GCC asm seems to need strbne written differently for thumb and arm
+#if CONFIG_THUMB
+         "it         ne                                          \n\t"
+         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
+#else
+         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
+#endif
+
+// Renorm
+         "clz        %[tmp]        , %[range]                    \n\t"
+         "sub        %[tmp]        , #23                         \n\t"
+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+         "subs       %[n]          , %[n]        , #1            \n\t"
+#if CONFIG_THUMB
+         "itt        ne                                          \n\t"
+         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
+         "bne        1b                                          \n\t"
+#else
+         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
+         "bne        1b                                          \n\t"
+#endif
+
+// If we have bits left then n must be 0 so give up now
+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+         "bne        2f                                          \n\t"
+
+// Do reload
+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+         "movw       %[r_b]        , #0xFFFF                     \n\t"
+         "rev        %[tmp]        , %[tmp]                      \n\t"
+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+
+         "rbit       %[r_b]        , %[low]                      \n\t"
+         "clz        %[r_b]        , %[r_b]                      \n\t"
+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+
+#if CONFIG_THUMB
+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+#else
+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+// Check to see if we still have more to do
+         "cmp        %[n]          , #0                          \n\t"
+         "bne        1b                                          \n\t"
+         "2:                                                     \n\t"
+         : [bit]"=&r"(bit),
+           [low]"+&r"(c->low),
+           [range]"+&r"(c->range),
+           [r_b]"=&r"(reg_b),
+           [bptr]"+&r"(c->bytestream),
+           [idx]"+&r"(p),
+           [n]"+&r"(n),
+           [tmp]"=&r"(tmp),
+           [st]"=&r"(st)
+          : [state0]"r"(state0),
+            [ctx_map]"r"(ctx_map),
+            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+            [byte]"M"(offsetof(CABACContext, bytestream)),
+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+          : "memory", "cc"
+    );
+
+    return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+//
+// By and large these are (at best) no faster than their C equivalents - the
+// only one worth having is _peek where we do a slightly better job than the
+// compiler
+//
+// The others have been stashed here for reference in case larger scale asm
+// is attempted in which case they might be a useful base
+
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+    uint32_t rv, tmp;
+    __asm__ (
+        "bic      %[rv]  , %[low], #1          \n\t"
+        "cmp      %[inv] , #0                  \n\t"
+        "it       ne                           \n\t"
+        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+        :  // Outputs
+             [rv]"=&r"(rv),
+             [tmp]"=r"(tmp)
+        :  // Inputs
+             [low]"r"(c->low),
+             [inv]"r"(c->range)
+        :  // Clobbers
+                "cc"
+    );
+    return rv << 1;
+}
+
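As the comment block above notes, in by22 mode c->range is reused to hold something other than a coding range; reading the asm, it appears to carry a fixed-point reciprocal (with 0 meaning "no scaling"). A hedged C sketch of the peek under that interpretation:

    /* C sketch of get_cabac_by22_peek_arm(): the peek is the top 32 bits of a
       32x32->64 multiply by the stashed inverse, or low itself when the
       inverse is 0. This interpretation of c->range is inferred, not stated. */
    static inline uint32_t cabac_by22_peek_c(const CABACContext *const c)
    {
        uint32_t v = c->low & ~1U;
        if (c->range != 0)
            v = (uint32_t)(((uint64_t)v * c->range) >> 32);
        return v << 1;
    }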
+#if 0
+
+// ***** Slower than the C :-(
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+{
+    uint32_t m, tmp;
+    __asm__ (
+        "add      %[bits], %[bits], %[n]          \n\t"
+        "ldr      %[m], [%[ptr], %[bits], lsr #3] \n\t"
+
+        "rsb      %[tmp], %[n], #32               \n\t"
+        "lsr      %[tmp], %[val], %[tmp]          \n\t"
+        "mul      %[tmp], %[range], %[tmp]        \n\t"
+
+        "rev      %[m], %[m]                      \n\t"
+
+        "lsl      %[tmp], %[tmp], #23             \n\t"
+        "rsb      %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+        "and      %[tmp], %[bits], #7             \n\t"
+        "lsl      %[m], %[m], %[tmp]              \n\t"
+
+        "orr      %[low], %[low], %[m], lsr #9    \n\t"
+        :  // Outputs
+             [m]"=&r"(m),
+             [tmp]"=&r"(tmp),
+             [bits]"+&r"(c->by22.bits),
+             [low]"+&r"(c->low)
+        :  // Inputs
+             [n]"r"(n),
+             [val]"r"(val),
+             [inv]"r"(c->range),
+             [range]"r"(c->by22.range),
+             [ptr]"r"(c->bytestream)
+        :  // Clobbers
+    );
+}
+
+
+// Works but slower than C
+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+{
+    uint32_t n, val, tmp, level;
+
+//    PROFILE_START();
+
+    __asm__ (
+        // Peek
+        "bic      %[val], %[low], #1              \n\t"
+        "cmp      %[inv], #0                      \n\t"
+        "umullne  %[tmp], %[val], %[inv], %[val]  \n\t"
+        "lsl      %[val], %[val], #1              \n\t"
+
+        // Count bits (n = prefix)
+        "mvn      %[n], %[val]                    \n\t"
+        "clz      %[n], %[n]                      \n\t"
+
+        "lsl      %[level], %[val], %[n]          \n\t"
+        "subs     %[tmp], %[n], #3                \n\t"
+        "blo      2f                              \n\t"
+
+        // prefix >= 3
+        // < tmp = prefix - 3
+        // > tmp = prefix + rice - 3
+        "add      %[tmp], %[tmp], %[rice]         \n\t"
+        // > n = prefix * 2 + rice - 3
+        "add      %[n], %[tmp], %[n]              \n\t"
+        "cmp      %[n], #21                       \n\t"
+        "bhi      3f                              \n\t"
+
+        "orr      %[level], %[level], #0x80000000 \n\t"
+        "rsb      %[tmp], %[tmp], #31             \n\t"
+        "lsr      %[level], %[level], %[tmp]      \n\t"
+
+        "mov      %[tmp], #2                      \n\t"
+        "add      %[level], %[level], %[tmp], lsl %[rice] \n\t"
+        "b        1f                              \n\t"
+
+        // > 22 bits used in total - need reload
+        "3:                                       \n\t"
+
+        // Stash prefix + rice - 3 in level (only spare reg)
+        "mov      %[level], %[tmp]                \n\t"
+        // Restore n to flush value (prefix)
+        "sub      %[n], %[n], %[tmp]              \n\t"
+
+        // Flush + reload
+
+//        "rsb      %[tmp], %[n], #32               \n\t"
+//        "lsr      %[tmp], %[val], %[tmp]          \n\t"
+//        "mul      %[tmp], %[range], %[tmp]        \n\t"
+
+        // As it happens we know that all the bits we are flushing are 1
+        // so we can cheat slightly
+        "rsb      %[tmp], %[range], %[range], lsl %[n] \n\t"
+        "lsl      %[tmp], %[tmp], #23             \n\t"
+        "rsb      %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+        "add      %[bits], %[bits], %[n]          \n\t"
+        "ldr      %[n], [%[ptr], %[bits], lsr #3] \n\t"
+        "rev      %[n], %[n]                      \n\t"
+        "and      %[tmp], %[bits], #7             \n\t"
+        "lsl      %[n], %[n], %[tmp]              \n\t"
+
+        "orr      %[low], %[low], %[n], lsr #9    \n\t"
+
+        // (reload)
+
+        "bic      %[val], %[low], #1              \n\t"
+        "cmp      %[inv], #0                      \n\t"
+        "umullne  %[tmp], %[val], %[inv], %[val]  \n\t"
+        "lsl      %[val], %[val], #1              \n\t"
+
+        // Build value
+
+        "mov      %[n], %[level]                  \n\t"
+
+        "orr      %[tmp], %[val], #0x80000000     \n\t"
+        "rsb      %[level], %[level], #31         \n\t"
+        "lsr      %[level], %[tmp], %[level]      \n\t"
+
+        "mov      %[tmp], #2                      \n\t"
+        "add      %[level], %[level], %[tmp], lsl %[rice] \n\t"
+        "b        1f                              \n\t"
+
+        // prefix < 3
+        "2:                                       \n\t"
+        "rsb      %[tmp], %[rice], #31            \n\t"
+        "lsr      %[level], %[level], %[tmp]      \n\t"
+        "orr      %[level], %[level], %[n], lsl %[rice] \n\t"
+        "add      %[n], %[n], %[rice]             \n\t"
+
+        "1:                                       \n\t"
+        // Flush
+        "add      %[n], %[n], #1                  \n\t"
+
+        "rsb      %[tmp], %[n], #32               \n\t"
+        "lsr      %[tmp], %[val], %[tmp]          \n\t"
+
+        "add      %[bits], %[bits], %[n]          \n\t"
+        "ldr      %[val], [%[ptr], %[bits], lsr #3] \n\t"
+
+        "mul      %[tmp], %[range], %[tmp]        \n\t"
+        "lsl      %[tmp], %[tmp], #23             \n\t"
+        "rsb      %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+        "rev      %[val], %[val]                  \n\t"
+        "and      %[tmp], %[bits], #7             \n\t"
+        "lsl      %[val], %[val], %[tmp]          \n\t"
+
+        "orr      %[low], %[low], %[val], lsr #9  \n\t"
+        :  // Outputs
+             [level]"=&r"(level),
+             [n]"=&r"(n),
+             [val]"=&r"(val),
+             [tmp]"=&r"(tmp),
+             [bits]"+&r"(c->by22.bits),
+             [low]"+&r"(c->low)
+        :  // Inputs
+             [rice]"r"(c_rice_param),
+             [inv]"r"(c->range),
+             [range]"r"(c->by22.range),
+             [ptr]"r"(c->bytestream)
+        :  // Clobbers
+               "cc"
+    );
+
+//    PROFILE_ACC(residual_abs);
+
+    return level;
+}
+#endif
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
index 166bddb..a088cc3 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
        vst1.8   {d4}, [r0]
        bx lr
endfunc
+
+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+ */
+function ff_hevc_deblocking_boundary_strengths_neon, export=1
+        add      ip, sp, #4*4
+        push     {a2-a4,v1-v8,lr}
+        ldmia    ip, {v5-v7}
+1:      ldmdb    ip, {v1-v4}
+        ldrsb    a3, [v5, #8]   @ curr->ref_idx
+        ldrsb    v8, [v5, #9]
+        ldrsb    ip, [v6, #8]   @ neigh->ref_idx
+        ldrsb    lr, [v6, #9]
+        ldr      v1, [v1, a3, lsl #2]
+        ldrb     a3, [v5, #10]  @ curr->pred_flag
+        ldr      v2, [v2, v8, lsl #2]
+        ldrb     v8, [v6, #10]  @ neigh->pred_flag
+        ldr      v3, [v3, ip, lsl #2]
+        ldr      v4, [v4, lr, lsl #2]
+        teq      a3, #3
+        beq      20f
+        teq      v8, #3
+        beq      90f
+
+        tst      a3, #1
+        itee     ne
+        ldrne    a3, [v5, #0]   @ curr->mv[0]
+        ldreq    a3, [v5, #4]   @ curr->mv[1]
+        moveq    v1, v2
+        tst      v8, #1
+        itee     ne
+        ldrne    v8, [v6, #0]   @ neigh->mv[0]
+        ldreq    v8, [v6, #4]   @ neigh->mv[1]
+        moveq    v3, v4
+        teq      v1, v3
+        bne      10f
+        ldr      lr, =0xFFFCFFFC
+        ssub16   ip, v8, a3
+        ssub16   a3, a3, v8
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        @ drop through
+10:     it       ne
+        movne    a3, #1
+11:     subs     a2, a2, #1
+12:
+A       strbhs   a3, [v7], a4
+T       itt      hs
+T       strbhs   a3, [v7]
+T       addhs    v7, v7, a4
+        subs     a2, a2, #1
+        bhs      12b
+
+        ldm      sp, {a2, a3}
+        add      ip, sp, #16*4
+        subs     a1, a1, #1
+        add      v5, v5, a3
+        add      v6, v6, a3
+        bhi      1b
+        pop      {a2-a4,v1-v8,pc}
+
+20:     teq      v8, #3
+        bne      10b
+
+        teq      v1, v3
+        it       eq
+        teqeq    v2, v4
+        bne      40f
+        teq      v1, v2
+        bne      30f
+
+        ldrd     v1, v2, [v5]   @ curr->mv
+        ldrd     v3, v4, [v6]   @ neigh->mv
+        ldr      lr, =0xFFFCFFFC
+        ssub16   ip, v3, v1
+        ssub16   a3, v1, v3
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        bne      25f
+        ssub16   ip, v4, v2
+        ssub16   a3, v2, v4
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        beq      11b
+        @ drop through
+25:     ssub16   ip, v4, v1
+        ssub16   a3, v1, v4
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        bne      10b
+        ssub16   ip, v3, v2
+        ssub16   a3, v2, v3
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        b        10b
+
+30:     ldrd     v1, v2, [v5]   @ curr->mv
+        ldrd     v3, v4, [v6]   @ neigh->mv
+        ldr      lr, =0xFFFCFFFC
+        ssub16   ip, v3, v1
+        ssub16   a3, v1, v3
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        bne      10b
+        ssub16   ip, v4, v2
+        ssub16   a3, v2, v4
+        sel      a3, a3, ip
+        ands     a3, a3, lr
+        b        10b
+
+40:     teq      v1, v4
+        ite      eq
+        teqeq    v2, v3
+        bne      10b
+
+        ldrd     v1, v2, [v5]   @ curr->mv
+        ldrd     v3, v4, [v6]   @ neigh->mv
+        ldr      lr, =0xFFFCFFFC
+        b        25b
+
+90:     mov      a3, #1
+        b        11b
+endfunc
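The ssub16/sel sequence with the 0xFFFCFFFC mask computes packed absolute differences of the two 16-bit MV components and tests whether either is 4 or more (one luma sample in quarter-pel units), which is the HEVC condition for boundary strength 1 on an inter/inter edge. A scalar sketch of that core test (FFABS is FFmpeg's abs macro):

    /* Scalar form of the masked ssub16/sel test: set bs = 1 when any motion
       vector component differs by >= 4 quarter-pel units. Reference-picture
       mismatches are handled by the earlier teq comparisons. */
    static int mv_diff_ge_4(int16_t cx, int16_t cy, int16_t nx, int16_t ny)
    {
        return FFABS(cx - nx) >= 4 || FFABS(cy - ny) >= 4;
    }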
diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
new file mode 100644
index 0000000..00eab9e
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_epel_neon.S
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro vextin_d4
+        vld1.8    {q10}, [r1], r2
+        vmov      d16, d20
+        vext.8    d17, d20, d21, #1
+        vext.8    d18, d20, d21, #2
+        vext.8    d19, d20, d21, #3
+.endm
+
+.macro vextin_d4_8
+        vld1.8    d16, [r1], r2
+        vext.8    d17, d16, d16, #1
+        vext.8    d18, d16, d16, #2
+        vext.8    d19, d16, d16, #3
+.endm
+
+.macro load_coeffs_16b coeffs
+        ldr       \coeffs, [\coeffs]
+        vdup.i8   d0, \coeffs
+        lsr       \coeffs, #8
+        vdup.i8   d1, \coeffs
+        lsr       \coeffs, #8
+        vdup.i8   d2, \coeffs
+        lsr       \coeffs, #8
+        vdup.i8   d3, \coeffs
+.endm
+
+.macro epel_filter_16b out=q12
+        vmull.u8  q3, d16, d0
+        vmull.u8  q11, d19, d3
+        vmull.u8  \out, d17, d1
+        vmull.u8  q10, d18, d2
+        vadd.s16  q3, q11
+        vadd.s16  \out, q10
+        vsub.s16  \out, q3
+.endm
+
+.macro load_coeffs_32b coeffs
+        ldr       \coeffs, [\coeffs]
+        vmov.i64  d4, #0
+        vmov.8    d4[0], \coeffs
+        lsr       \coeffs, #8
+        vmov.8    d4[2], \coeffs
+        lsr       \coeffs, #8
+        vmov.8    d4[4], \coeffs
+        lsr       \coeffs, #8
+        vmov.8    d4[6], \coeffs
+.endm
+
+.macro epel_filter_32b
+        vmull.s16 q3, d24, d4[0]   //q12
+        vmull.s16 q4, d25, d4[0]
+        vmull.s16 q5, d30, d4[3]   //q15
+        vmull.s16 q6, d31, d4[3]
+
+        vmull.s16 q7, d26, d4[1]   // q13
+        vmull.s16 q8, d27, d4[1]
+        vmull.s16 q9, d28, d4[2]   // q14
+        vmull.s16 q10, d29, d4[2]
+        vadd.s32  q3, q5
+        vadd.s32  q4, q6
+        vadd.s32  q7, q9
+        vadd.s32  q8, q10
+        vsub.s32  q7, q3
+        vsub.s32  q8, q4
+        vqshrn.s32 d6, q7, #6
+        vqshrn.s32 d7, q8, #6
+.endm
+
+.macro epel_filter_32b_4
+        vmull.s16 q3, d24, d4[0]   //q12
+        vmull.s16 q5, d30, d4[3]   //q15
+        vmull.s16 q7, d26, d4[1]   // q13
+        vmull.s16 q9, d28, d4[2]   // q14
+        vadd.s32  q3, q5
+        vadd.s32  q7, q9
+        vsub.s32  q7, q3
+        vqshrn.s32 d6, q7, #6
+.endm
+
+function ff_hevc_put_epel_h_neon_8, export=1
+        push     {r4-r7}
+        mov      r4, MAX_PB_SIZE
+        ldr      r7, [sp, #16]  // mx
+        ldr      r5, [sp, #24]  // width
+        sub      r7, #1
+        lsl      r7, #2
+        vpush    {d8-d15}
+@ adr reaches if we are in thumb mode but not in arm
+T       adr      r12, epel_coeffs
+A       adrl     r12, epel_coeffs
+        add      r7, r12
+        sub      r1, #1
+        lsl      r4, #1
+        load_coeffs_16b r7
+        mov      r12, r3
+        mov      r6, r0
+        mov      r7, r1
+        cmp      r5, #6
+        bgt      8f
+        cmp      r5, #4
+        blt      2f
+        b        4f
+8:      subs     r3, #1
+        pld      [r1]
+        vextin_d4
+        epel_filter_16b
+        vst1.16  {q12}, [r0], r4
+        bne      8b
+        subs     r5, #8
+        beq      99f
+        mov      r3, r12
+        add      r6, #16
+        mov      r0, r6
+        add      r7, #8
+        mov      r1, r7
+        cmp      r5, #4
+        bgt      8b
+4:      subs     r3, #1
+        pld      [r1]
+        vextin_d4_8
+        epel_filter_16b
+        vst1.16  d24, [r0], r4
+        bne      4b
+        subs     r5, #4
+        beq      99f
+        mov      r3, r12
+        add      r6, #8
+        mov      r0, r6
+        add      r7, #4
+        mov      r1, r7
+2:      subs     r3, #1
+        pld      [r1]
+        vextin_d4_8
+        epel_filter_16b
+        vst1.32  d24[0], [r0], r4
+        bne      2b
+99:     vpop     {d8-d15}
+        pop      {r4-r7}
+        bx lr
+endfunc
+
+function ff_hevc_put_epel_v_neon_8, export=1
+        push     {r4-r7}
+        mov      r4, MAX_PB_SIZE
+        ldr      r7, [sp, #20]  // my
+        ldr      r5, [sp, #24]  // width
+        sub      r7, #1
+        lsl      r7, #2
+        vpush    {d8-d15}
T       adr      r12, epel_coeffs
A       adrl     r12, epel_coeffs
+        add      r7, r12
+        load_coeffs_16b r7
+        sub      r1, r2
+        lsl      r4, #1
+        mov      r12, r3
+        mov      r6, r0
+        mov      r7, r1
+0:      pld      [r1]
+        vld1.8   {d16}, [r1], r2
+        pld      [r1]
+        vld1.8   {d17}, [r1], r2
+        pld      [r1]
+        vld1.8   {d18}, [r1], r2
+        cmp      r5, #6
+        bgt      8f
+        cmp      r5, #4
+        blt      2f
+        b        4f
+8:      pld      [r1]
+        vld1.8   {d19}, [r1], r2
+        subs     r3, #1
+        epel_filter_16b
+        vst1.16  {q12}, [r0], r4
+        vmov     d16, d17
+        vmov     d17, d18
+        vmov     d18, d19
+        bne      8b
+        subs     r5, #8
+        beq      99f
+        mov      r3, r12
+        add      r6, #16
+        mov      r0, r6
+        add      r7, #8
+        mov      r1, r7
+        b        0b
+4:      pld      [r1]
+        vld1.8   {d19}, [r1], r2
+        subs     r3, #1
+        epel_filter_16b
+        vst1.16  d24, [r0], r4
+        vmov     d16, d17
+        vmov     d17, d18
+        vmov     d18, d19
+        bne      4b
+        subs     r5, #4
+        beq      99f
+        mov      r3, r12
+        add      r6, #8
+        mov      r0, r6
+        add      r7, #4
+        mov      r1, r7
+        b        0b
+2:      pld      [r1]
+        vld1.8   {d19}, [r1], r2
+        subs     r3, #1
+        epel_filter_16b
+        vst1.32  d24[0], [r0], r4
+        vmov     d16, d17
+        vmov     d17, d18
+        vmov     d18, d19
+        bne      2b
+99:     vpop     {d8-d15}
+        pop      {r4-r7}
+        bx lr
+endfunc
+
+function ff_hevc_put_epel_hv_neon_8, export=1
+        push     {r4-r7}
+        mov      r4, MAX_PB_SIZE
+        ldr      r6, [sp, #16]  // mx
+        ldr      r7, [sp, #20]  // my
+        ldr      r5, [sp, #24]  // width
+        sub      r7, #1
+        lsl      r7, #2
+        vpush    {d8-d15}
+        adr      r12, epel_coeffs
+        sub      r6, #1
+        lsl      r6, #2
+        add      r6, r12        // mx epel coeff offset
+        add      r7, r12
+        sub      r1, #1
+        sub      r1, r2
+        lsl      r4, #1
+        load_coeffs_16b r6
+        load_coeffs_32b r7
+        mov      r12, r3
+        mov      r6, r0
+        mov      r7, r1
+0:      pld      [r1]
+        vextin_d4
+        epel_filter_16b q12
+        pld      [r1]
+        vextin_d4
+        epel_filter_16b q13
+        pld      [r1]
+        vextin_d4
+        epel_filter_16b q14
+        cmp      r5, #6
+        bgt      8f
+        cmp      r5, #4
+        blt      2f
+        b        4f
+8:      pld      [r1]
+        vextin_d4
+        epel_filter_16b q15
+        subs     r3, #1
+        epel_filter_32b
+        vst1.16  {q3}, [r0], r4
+        vmov     q12, q13
+        vmov     q13, q14
+        vmov     q14, q15
+        bne      8b
+        subs     r5, #8
+        beq      99f
+        mov      r3, r12
+        add      r6, #16
+        mov      r0, r6
+        add      r7, #8
+        mov      r1, r7
+        b        0b
+4:      pld      [r1]
+        vextin_d4_8
+        epel_filter_16b q15
+        subs     r3, #1
+        epel_filter_32b_4
+        vst1.16  d6, [r0], r4
+        vmov     q12, q13
+        vmov     q13, q14
+        vmov     q14, q15
+        bne      4b
+        subs     r5, #4
+        beq      99f
+        mov      r3, r12
+        add      r6, #8
+        mov      r0, r6
+        add      r7, #4
+        mov      r1, r7
+        b        0b
+2:      pld      [r1]
+        vextin_d4_8
+        epel_filter_16b q15
+        subs     r3, #1
+        epel_filter_32b_4
+        vst1.32  d6[0], [r0], r4
+        vmov     q12, q13
+        vmov     q13, q14
+        vmov     q14, q15
+        bne      2b
+99:     vpop     {d8-d15}
+        pop      {r4-r7}
+        bx lr
+endfunc
+
+epel_coeffs:
+        .byte 2, 58, 10, 2
+        .byte 4, 54, 16, 2
+        .byte 6, 46, 28, 4
+        .byte 4, 36, 36, 4
+        .byte 4, 28, 46, 6
+        .byte 2, 16, 54, 4
+        .byte 2, 10, 58, 2
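Each epel_coeffs row is the {c0,c1,c2,c3} tap set for fractional positions 1..7; the epel_filter_16b/_32b macros sum the two middle taps and subtract the two outer ones, which is the standard HEVC chroma interpolation filter. In scalar C form:

    /* Scalar reference for the 4-tap HEVC chroma (epel) filter the NEON
       macros implement: out = -c0*x[-1] + c1*x[0] + c2*x[1] - c3*x[2]. */
    static int epel_filter_ref(const uint8_t *x, const uint8_t c[4])
    {
        return -c[0] * x[-1] + c[1] * x[0] + c[2] * x[1] - c[3] * x[2];
    }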
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 5591807..49c70dd 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -22,6 +22,8 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/bit_depth_template.c"

void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride);

+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+
+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+
+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+
#define PUT_PIXELS(name) \
    void name(int16_t *dst, uint8_t *src, \
              ptrdiff_t srcstride, int height, \
@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
#undef PUT_PIXELS
+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+                               ptrdiff_t srcstride, int height,
+                               intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+                               ptrdiff_t srcstride, int height,
+                               intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+                                ptrdiff_t srcstride, int height,
+                                intptr_t mx, intptr_t my, int width);

static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
                                        int height, int width);
@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
}

+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+    pixel *dst = (pixel *)_dst;
+    pixel *src = (pixel *)_src;
+    int8_t offset_table[32] = { 0 };
+    int k, y, x;
+    int shift  = 3; // BIT_DEPTH - 5
+    int cwidth = 0;
+
+    stride_src /= sizeof(pixel);
+    stride_dst /= sizeof(pixel);
+
+    for (k = 0; k < 4; k++)
+        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+
+    if (height % 8 == 0)
+        cwidth = width;
+
+    switch(cwidth){
+    case 8:
+        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+        break;
+    case 16:
+        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+        break;
+    case 32:
+        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+        break;
+    case 64:
+        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+        break;
+    default:
+        for (y = 0; y < height; y++) {
+            for (x = 0; x < width; x++)
+                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+                                          int16_t *_sao_offset_val, int eo, int width, int height)
+{
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    static const int8_t pos[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    int8_t sao_offset_val[8]; // padding of 3 for vld
+    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
+    pixel *dst = (pixel *)_dst;
+    pixel *src = (pixel *)_src;
+    int a_stride, b_stride;
+    int x, y;
+    int cwidth = 0;
+
+    for (x = 0; x < 5; x++) {
+        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
+    }
+
+    if (height % 8 == 0)
+        cwidth = width;
+
+    stride_src /= sizeof(pixel);
+    stride_dst /= sizeof(pixel);
+
+    switch (cwidth) {
+    case 32:
+        switch(eo) {
+        case 0:
+            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        case 1:
+            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        case 2:
+            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        case 3:
+            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        }
+        break;
+    case 64:
+        switch(eo) {
+        case 0:
+            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        case 1:
+            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        case 2:
+            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        case 3:
+            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+            break;
+        }
+        break;
+    default:
+        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+        for (y = 0; y < height; y++) {
+            for (x = 0; x < width; x++) {
+                int diff0 = CMP(src[x], src[x + a_stride]);
+                int diff1 = CMP(src[x], src[x + b_stride]);
+                int idx   = diff0 + diff1;
+                if (idx)
+                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
+            }
+            src += stride_src;
+            dst += stride_dst;
+        }
+    }
+}
+#undef CMP
+
+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+                                                MvField *curr, MvField *neigh, uint8_t *bs);
+
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
    if (bit_depth == 8) {
@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
        c->transform_add[2]    = ff_hevc_transform_add_16x16_neon_8;
        c->transform_add[3]    = ff_hevc_transform_add_32x32_neon_8;
        c->idct_4x4_luma       = ff_hevc_transform_luma_4x4_neon_8;
+        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+            c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper;
+            c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper;
+        }
        put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
        put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
        put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
            c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
            c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
            c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_epel[x][1][0]    = ff_hevc_put_epel_v_neon_8;
+            c->put_hevc_epel[x][0][1]    = ff_hevc_put_epel_h_neon_8;
+            c->put_hevc_epel[x][1][1]    = ff_hevc_put_epel_hv_neon_8;
        }
+        c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+        c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+        c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+        c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+        c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+        c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+        c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+        c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+        c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+        c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
        c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
        c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
        c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
        c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
        c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
    }
+
+    assert(offsetof(MvField, mv) == 0);
+    assert(offsetof(MvField, ref_idx) == 8);
+    assert(offsetof(MvField, pred_flag) == 10);
+    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
}
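For the NEON band filter in the next file, the per-pixel operation at 8-bit depth is the same one the wrapper's C fallback performs; written out on its own:

    /* Per-pixel SAO band offset at 8-bit depth: band index = pixel >> 3
       (BIT_DEPTH - 5). The NEON version does the lookup with vtbl and the
       clip with a +/-128-biased vqadd.s8 instead of an explicit clamp. */
    static uint8_t sao_band_pixel(uint8_t px, const int8_t offset_table[32])
    {
        return av_clip_uint8(px + offset_table[px >> 3]);
    }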
diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
|
|
new file mode 100644
|
|
index 0000000..9c7808d
|
|
--- /dev/null
|
|
+++ b/libavcodec/arm/hevcdsp_sao_neon.S
|
|
@@ -0,0 +1,510 @@
|
|
+/*
|
|
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
|
+ *
|
|
+ * This file is part of FFmpeg.
|
|
+ *
|
|
+ * FFmpeg is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU Lesser General Public
|
|
+ * License as published by the Free Software Foundation; either
|
|
+ * version 2.1 of the License, or (at your option) any later version.
|
|
+ *
|
|
+ * FFmpeg is distributed in the hope that it will be useful,
|
|
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ * Lesser General Public License for more details.
|
|
+ *
|
|
+ * You should have received a copy of the GNU Lesser General Public
|
|
+ * License along with FFmpeg; if not, write to the Free Software
|
|
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
+ */
|
|
+
|
|
+#include "libavutil/arm/asm.S"
|
|
+#include "neon.S"
|
|
+
|
|
+.macro init_sao_band
|
|
+ pld [r1]
|
|
+ vld1.8 {q0, q1}, [r2] // offset table
|
|
+ ldr r2, [sp, #0] // stride_dst
|
|
+ ldr r12, [sp, #4] // height
|
|
+ vmov.u8 q3, #128
|
|
+.endm
|
|
+
|
|
+// 128 in q3
|
|
+// input q8 - q11
|
|
+.macro sao_band_64
|
|
+ vtbl.8 d24, {d0, d1, d2, d3}, d24
|
|
+ vadd.s8 q8, q3
|
|
+ vtbl.8 d25, {d0, d1, d2, d3}, d25
|
|
+ vadd.s8 q9, q3
|
|
+ vtbl.8 d26, {d0, d1, d2, d3}, d26
|
|
+ vadd.s8 q10, q3
|
|
+ vtbl.8 d27, {d0, d1, d2, d3}, d27
|
|
+ vadd.s8 q11, q3
|
|
+ vtbl.8 d28, {d0, d1, d2, d3}, d28
|
|
+ vqadd.s8 q8, q12
|
|
+ vtbl.8 d29, {d0, d1, d2, d3}, d29
|
|
+ vqadd.s8 q9, q13
|
|
+ vtbl.8 d30, {d0, d1, d2, d3}, d30
|
|
+ vqadd.s8 q10, q14
|
|
+ vtbl.8 d31, {d0, d1, d2, d3}, d31
|
|
+ vsub.s8 q8, q3
|
|
+ vqadd.s8 q11, q15
|
|
+ vsub.s8 q9, q3
|
|
+ vsub.s8 q10, q3
|
|
+ vsub.s8 q11, q3
|
|
+.endm
+
+function ff_hevc_sao_band_w8_neon_8, export=1
+ init_sao_band
+1: subs r12, #8
+ vld1.8 {d16}, [r1, :64], r3
+ vld1.8 {d17}, [r1, :64], r3
+ vshr.u8 q12, q8, #3
+ vld1.8 {d18}, [r1, :64], r3
+ vld1.8 {d19}, [r1, :64], r3
+ vshr.u8 q13, q9, #3
+ vld1.8 {d20}, [r1, :64], r3
+ vld1.8 {d21}, [r1, :64], r3
+ vshr.u8 q14, q10, #3
+ vld1.8 {d22}, [r1, :64], r3
+ vld1.8 {d23}, [r1, :64], r3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {d16}, [r0, :64], r2
+ vst1.8 {d17}, [r0, :64], r2
+ vst1.8 {d18}, [r0, :64], r2
+ vst1.8 {d19}, [r0, :64], r2
+ vst1.8 {d20}, [r0, :64], r2
+ vst1.8 {d21}, [r0, :64], r2
+ vst1.8 {d22}, [r0, :64], r2
+ vst1.8 {d23}, [r0, :64], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w16_neon_8, export=1
+ init_sao_band
+1: subs r12, #4
+ vld1.8 {q8}, [r1, :128], r3
+ vshr.u8 q12, q8, #3
+ vld1.8 {q9}, [r1, :128], r3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vld1.8 {q11}, [r1, :128], r3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {q8}, [r0, :128], r2
+ vst1.8 {q9}, [r0, :128], r2
+ vst1.8 {q10}, [r0, :128], r2
+ vst1.8 {q11}, [r0, :128], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w32_neon_8, export=1
+ init_sao_band
+1: subs r12, #2
+ vld1.8 {q8-q9}, [r1, :128], r3
+ vshr.u8 q12, q8, #3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vshr.u8 q15, q11, #3
+ sao_band_64
+ vst1.8 {q8-q9}, [r0, :128], r2
+ vst1.8 {q10-q11}, [r0, :128], r2
+ bne 1b
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_band_w64_neon_8, export=1
+ init_sao_band
+1: subs r12, #1
+ pld [r1, r3]
+ vld1.8 {q8-q9}, [r1, :128]!
+ vshr.u8 q12, q8, #3
+ vshr.u8 q13, q9, #3
+ vld1.8 {q10-q11}, [r1, :128], r3
+ vshr.u8 q14, q10, #3
+ vshr.u8 q15, q11, #3
+ sub r1, #32
+ sao_band_64
+ vst1.8 {q8-q9}, [r0, :128]!
+ vst1.8 {q10-q11}, [r0, :128], r2
+ sub r0, #32
+ bne 1b
+
+ bx lr
+endfunc
+
+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0
+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0
+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2
+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2
+ vsub.s8 \out0, \tmp0, \out0 // diff0
+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+.endm
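In scalar C, diff32 computes a byte-wise sign(c - n) from two unsigned compares, which cannot overflow the way a plain subtraction of two u8 values could. A short sketch of the idea (illustrative helpers, not part of the patch):

#include <stdint.h>

// sign(c - n) as (c > n) - (n > c): -1, 0 or +1 per byte,
// which is exactly what the vcgt.u8 + vsub.s8 pair produces.
static inline int sign_diff(uint8_t c, uint8_t n)
{
    return (c > n) - (n > c);
}

// The eoN bodies then form the HEVC edge-offset class from two of these:
// 2 + sign(c - a) + sign(c - b), an index in 0..4 used for the table lookup.
static inline int edge_class(uint8_t a, uint8_t c, uint8_t b)
{
    return 2 + sign_diff(c, a) + sign_diff(c, b);
}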
+
+.macro table64
+ vmov.s8 q13, #2 // 2 to all elements
+ vmov.32 d24[0], r4 // load offset table from general registers
+ vmov.32 d24[1], r5 // load rest of offset table
+
+ vadd.s8 q0, q13
+ vadd.s8 q1, q13
+ vadd.s8 q2, q13
+ vadd.s8 q3, q13
+
+ vmov.u8 q15, #128 // s8 #-128
+ vtbl.8 d0, {d24}, d0
+ vadd.s8 q13, q4, q15
+ vtbl.8 d1, {d24}, d1
+ vadd.s8 q14, q5, q15
+ vtbl.8 d2, {d24}, d2
+ vqadd.s8 q0, q13
+ vtbl.8 d3, {d24}, d3
+ vqadd.s8 q1, q14
+ vtbl.8 d4, {d24}, d4
+ vadd.s8 q13, q6, q15
+ vtbl.8 d5, {d24}, d5
+ vadd.s8 q14, q7, q15
+ vtbl.8 d6, {d24}, d6
+ vqadd.s8 q2, q13
+ vtbl.8 d7, {d24}, d7
+ vqadd.s8 q3, q14
+ vsub.s8 q0, q15
+ vsub.s8 q1, q15
+ vsub.s8 q2, q15
+ vsub.s8 q3, q15
+ vst1.8 {q0-q1}, [r0, :128]!
+ vst1.8 {q2-q3}, [r0, :128], r2
+ sub r0, #32
+.endm
+
+// input
+// a in q0 - q3
+// c in q4 - q7
+// b in q8 - q11
+// offset table in r4 and r5
+// output in q0 - q3
+// clobbers q12 - q15
+.macro edge_w64_body
+ diff32 q12, q13, q0, q1, q0, q1, q4, q5
+ diff32 q0, q1, q14, q15, q8, q9, q4, q5
+
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+
+ diff32 q14, q15, q2, q3, q2, q3, q6, q7
+ diff32 q2, q3, q12, q13, q10, q11, q6, q7
+
+ vadd.s8 q2, q14
+ vadd.s8 q3, q15
+ table64
+.endm
+
+.macro init_edge_64
+ push {r4-r5}
+ ldr r12, [sp, #8] // height
+ ldr r5, [sp, #12] // sao_offset_val_table
+ ldr r4, [r5]
+ add r5, #4
+ ldr r5, [r5]
+.endm
+
+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+ sub r1, #8
+1: subs r12, #1
+ vld1.64 {d7}, [r1, :64]!
+ vld1.64 {q4-q5}, [r1, :128]! // load c
+ vld1.64 {q6-q7}, [r1, :128]!
+ vld1.64 {d24}, [r1, :64], r3
+ sub r1, #72
+ // load a
+ vext.8 q0, q3, q4, #15
+ vext.8 q1, q4, q5, #15
+ vext.8 q2, q5, q6, #15
+ vext.8 q3, q6, q7, #15
+ // load b
+ vext.8 q8, q4, q5, #1
+ vext.8 q9, q5, q6, #1
+ vext.8 q10, q6, q7, #1
+ vext.8 q11, q7, q12, #1
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+ sub r1, r3
+ // load a
+ vld1.8 {q0-q1}, [r1, :128]!
+ vld1.8 {q2-q3}, [r1, :128], r3
+ sub r1, #32
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128]!
+ vld1.8 {q10-q11}, [r1, :128], r3
+ sub r1, #32
+ edge_w64_body
+ // copy c to a
+ vmov.64 q0, q4
+ vmov.64 q1, q5
+ vmov.64 q2, q6
+ vmov.64 q3, q7
+ // copy b to c
+ vmov.64 q4, q8
+ vmov.64 q5, q9
+ vmov.64 q6, q10
+ vmov.64 q7, q11
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ sub r1, #1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ sub r1, #31
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+ // load b
+ add r1, #1
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #33
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
+ init_edge_64
+ vpush {d8-d15}
+1: sub r1, r3
+ // load a
+ // TODO: fix unaligned load
+ // don't reload a like in eo1
+ add r1, #1
+ vld1.8 {q0-q1}, [r1]!
+ vld1.8 {q2-q3}, [r1], r3
+ sub r1, #33
+ subs r12, #1
+ // load c
+ vld1.8 {q4-q5}, [r1, :128]!
+ vld1.8 {q6-q7}, [r1, :128], r3
+ sub r1, #32
+ // load b
+ sub r1, #1
+ vld1.8 {q8-q9}, [r1]!
+ vld1.8 {q10-q11}, [r1]
+ sub r1, #31
+ edge_w64_body
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r5}
+ bx lr
+endfunc
+
+.macro init_edge_32
+ ldr r12, [sp, #4] // sao_offset_val_table
+ vld1.32 {d31}, [r12]
+ ldr r12, [sp] // height
+.endm
+
+.macro diff out0, tmp0, in0, in1
+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0
+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0
+ vsub.s8 \out0, \tmp0, \out0 // diff0
+.endm
+
+.macro table32
+ vmov.s8 q10, #2
+ vadd.s8 q0, q10
+ vadd.s8 q1, q10
+ vmov.s8 q10, #128
+ vtbl.8 d0, {d31}, d0
+ vadd.s8 q11, q2, q10
+ vtbl.8 d1, {d31}, d1
+ vadd.s8 q12, q3, q10
+ vtbl.8 d2, {d31}, d2
+ vqadd.s8 q11, q0
+ vtbl.8 d3, {d31}, d3
+ vqadd.s8 q12, q1
+ vsub.s8 q0, q11, q10
+ vsub.s8 q1, q12, q10
+ vst1.8 {q0-q1}, [r0, :128], r2
+.endm
+
+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
+ init_edge_32
+ vpush {q4-q7}
+ sub r1, #4
+1: subs r12, #1
+ vld1.8 {q13-q14}, [r1]!
+ vld1.32 {d30}, [r1], r3
+ sub r1, #32
+ // a
+ vext.8 q0, q13, q14, #3
+ vext.8 q1, q14, q15, #3
+ vshr.u64 d24, d30, #24
+ // c
+ vext.8 q2, q13, q14, #4
+ vext.8 q3, q14, q15, #4
+ vshr.u64 d16, d30, #32
+ // diff0
+ diff32 q13, q14, q4, q5, q0, q1, q2, q3
+ diff d18, d25, d24, d16
+ // -diff1
+ vext.s8 q0, q13, q14, #1
+ vext.s8 q1, q14, q9, #1
+
+ vsub.s8 q0, q13, q0 //diff0 + diff1
+ vsub.s8 q1, q14, q1
+ table32
+ bne 1b
+ vpop {q4-q7}
+
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
+ init_edge_32
+ vpush {q4-q7}
+ // load a
+ sub r1, r3
+ vld1.8 {q0-q1}, [r1, :128], r3
+ // load c
+ vld1.8 {q2-q3}, [r1, :128], r3
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
+1: subs r12, #1
+ // load b
+ vld1.8 {q8-q9}, [r1, :128], r3
+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
+ vadd.s8 q0, q4, q12 //diff0 + diff1
+ vadd.s8 q1, q5, q13
+ table32
+ // CMP ( c, a )
+ vneg.s8 q12, q4
+ vneg.s8 q13, q5
+ // c
+ vmov.64 q2, q8
+ vmov.64 q3, q9
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
+ init_edge_32
+ vpush {d8-d15}
+ // load a
+ sub r1, r3
+ sub r1, #8
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {d24}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q0, q10, q11, #7
+ vext.8 q1, q11, q12, #7
+ // load c
+ vld1.8 {d9}, [r1, :64]!
+ vld1.8 {q2-q3}, [r1, :64], r3
+ sub r1, #8
+ vext.8 q4, q4, q2, #15
+1: subs r12, #1
+ // load b
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {q12}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q8, q10, q11, #9
+ vext.8 q9, q11, q12, #9
+ vext.8 q6, q10, q11, #8
+ vext.8 q7, q11, q12, #8
+ vext.8 q5, q10, q11, #7
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3
+ diff32 q0, q1, q10, q11, q8, q9, q2, q3
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+ table32
+ // inputs for next loop iteration
+ // a
+ vmov.8 q0, q4
+ vext.8 q1, q2, q3, #15
+ // c
+ vmov.8 q2, q6
+ vmov.8 q3, q7
+ vmov.8 q4, q5
+ bne 1b
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
+ init_edge_32
+ sub r1, r3
+ // load a
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {d24}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q0, q10, q11, #1
+ vext.8 q1, q11, q12, #1
+ // load c
+ vld1.8 {q2-q3}, [r1, :64]!
+ vld1.8 {d30}, [r1, :64], r3
+ sub r1, #40
+1: subs r12, #1
+ // load b
+ vld1.8 {q10-q11}, [r1, :64]!
+ vld1.8 {q12}, [r1, :64], r3
+ sub r1, #32
+ vext.8 q8, q10, q11, #7
+ vext.8 q9, q11, q12, #7
+ vext.8 q14, q12, q10, #7
+
+ diff32 q12, q13, q0, q1, q0, q1, q2, q3
+ diff32 q0, q1, q10, q11, q8, q9, q2, q3
+
+ vadd.s8 q0, q12 //diff0 + diff1
+ vadd.s8 q1, q13
+ table32
+
+ // inputs for next loop iteration
+ // a
+ vext.8 q0, q2, q3, #1
+ vext.8 q1, q3, q15, #1
+ // c
+ vext.8 q2, q8, q9, #1
+ vext.8 q3, q9, q14, #1
+ vext.8 d30, d28, d2, #1
+ bne 1b
+ bx lr
+endfunc
+
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 39713ed..25eb52b 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,8 @@ enum AVCodecID {
AV_CODEC_ID_SHEERVIDEO,
AV_CODEC_ID_YLC,

+ AV_CODEC_ID_H264_MVC,
+
/* various PCM "codecs" */
AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
#define FF_BUG_DC_CLIP 4096
#define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders.
#define FF_BUG_TRUNCATED 16384
+#define FF_BUG_GMC_UNSUPPORTED 32768

/**
* strictly follow the standard (MPEG-4, ...).
@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_CAVLC_444 44
+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118
+#define FF_PROFILE_H264_STEREO_HIGH 128
+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138

#define FF_PROFILE_VC1_SIMPLE 0
#define FF_PROFILE_VC1_MAIN 1
@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
#endif

+ /**
+ * Opaque pointer for use by replacement get_buffer2 code
+ *
+ * @author jc (08/02/2016)
+ */
+ void * get_buffer_context;
} AVCodecContext;

AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 1bf1c62..ccfa991 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
typedef struct CABACContext{
int low;
int range;
- int outstanding_count;
+ union
+ {
+ int outstanding_count;
+ struct {
+ uint16_t bits;
+ uint16_t range;
+ } by22;
+ };
const uint8_t *bytestream_start;
const uint8_t *bytestream;
const uint8_t *bytestream_end;
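A toy model of the CABACContext change above (names shortened, purely illustrative): the by22 bypass decoder reuses the storage of outstanding_count as a packed pair of 16-bit fields, so only one view of the union is live at any time.

#include <stdint.h>
#include <stdio.h>

union cabac_count {
    int outstanding_count;   // live on the normal CABAC path
    struct {
        uint16_t bits;       // live on the by22 bypass path
        uint16_t range;
    } by22;
};

int main(void)
{
    union cabac_count c;
    c.by22.bits  = 22;
    c.by22.range = 256;
    printf("bits=%u range=%u\n", (unsigned)c.by22.bits, (unsigned)c.by22.range);
    return 0;
}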
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 9d94b72..535ebf0 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
+ {
+ .id = AV_CODEC_ID_H264_MVC,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "h264_mvc",
+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },

/* various PCM "codecs" */
{
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index efe3555..16358aa 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -126,7 +126,9 @@ enum {
NAL_END_STREAM = 11,
NAL_FILLER_DATA = 12,
NAL_SPS_EXT = 13,
+ NAL_SPS_SUBSET = 15,
NAL_AUXILIARY_SLICE = 19,
+ NAL_SLICE_EXT = 20,
NAL_FF_IGNORE = 0xff0f001,
};

diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index ce4bab2..b9b0c78 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
uint8_t parse_history[6];
int parse_history_count;
int parse_last_mb;
+ int is_mvc;
+ int slice_ext;
} H264ParseContext;


@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
} else if (state <= 5) {
int nalu_type = buf[i] & 0x1F;
if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
- nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+ nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+ nalu_type == NAL_SPS_SUBSET) {
if (pc->frame_start_found) {
i++;
goto found;
}
} else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
- nalu_type == NAL_IDR_SLICE) {
+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
state += 8;
+
+ p->slice_ext = (nalu_type == NAL_SLICE_EXT);
continue;
}
state = 7;
} else {
p->parse_history[p->parse_history_count++] = buf[i];
- if (p->parse_history_count > 5) {
+ if (p->parse_history_count > 8) {
unsigned int mb, last_mb = p->parse_last_mb;
GetBitContext gb;

- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
p->parse_history_count = 0;
mb= get_ue_golomb_long(&gb);
p->parse_last_mb = mb;
@@ -145,7 +150,7 @@ found:
pc->frame_start_found = 0;
if (p->is_avc)
return next_avc;
- return i - (state & 5) - 8 * (state > 7);
}
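A sketch of the 3-byte skip in the init_get_bits8() call above (my paraphrase of the change, not code from the patch): a coded slice extension (NAL type 20) carries a 3-byte MVC extension header between the NAL header byte and the slice header, so first_mb_in_slice only begins 3 bytes later than in a plain slice; the history buffer is therefore allowed to grow to 9 bytes before the first macroblock index is decoded.

#include <stdint.h>

// Hypothetical helper mirroring the offset arithmetic above.
static const uint8_t *slice_header_start(const uint8_t *after_nal_header,
                                         int is_slice_ext)
{
    return after_nal_header + (is_slice_ext ? 3 : 0);
}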

static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
}
}

- parse_nal_units(s, avctx, buf, buf_size);
+ if (!p->is_mvc)
+ parse_nal_units(s, avctx, buf, buf_size);

if (avctx->framerate.num)
avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
if ((state & 0xFFFFFF00) != 0x100)
break;
nalu_type = state & 0x1F;
- if (nalu_type == NAL_SPS) {
+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
has_sps = 1;
} else if (nalu_type == NAL_PPS)
has_pps = 1;
@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
.parser_close = h264_close,
.split = h264_split,
};
+
+static av_cold int init_mvc(AVCodecParserContext *s)
+{
+ H264ParseContext *p = s->priv_data;
+ int ret = init(s);
+ if (ret < 0)
+ return ret;
+
+ p->is_mvc = 1;
+ return 0;
+}
+
+AVCodecParser ff_h264_mvc_parser = {
+ .codec_ids = { AV_CODEC_ID_H264_MVC },
+ .priv_data_size = sizeof(H264ParseContext),
+ .parser_init = init_mvc,
+ .parser_parse = h264_parse,
+ .parser_close = h264_close,
+ .split = h264_split,
+};
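Once AV_CODEC_ID_H264_MVC and ff_h264_mvc_parser are registered, the parser is driven like any other AVCodecParser. A hedged usage sketch (not from the patch; error handling omitted):

#include <libavcodec/avcodec.h>

static void feed_mvc(const uint8_t *data, int size)
{
    AVCodecParserContext *pc = av_parser_init(AV_CODEC_ID_H264_MVC);
    AVCodecContext *avctx    = avcodec_alloc_context3(NULL);
    uint8_t *out; int out_size;

    while (size > 0) {
        int used = av_parser_parse2(pc, avctx, &out, &out_size,
                                    data, size,
                                    AV_NOPTS_VALUE, AV_NOPTS_VALUE, 0);
        data += used; size -= used;
        if (out_size > 0) {
            /* one complete access unit, including any NAL_SLICE_EXT slices */
        }
    }
    av_parser_close(pc);
    avcodec_free_context(&avctx);
}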
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index b478065..88dd40b 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -41,8 +41,186 @@
#include "hevc.h"
#include "profiles.h"

+#ifdef RPI
+ #include "rpi_qpu.h"
+ #include "rpi_user_vcsm.h"
+ // Move Inter prediction into separate pass
+ #define RPI_INTER
+
+ #ifdef RPI_INTER_QPU
+ // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+ #define RPI_MULTI_MAILBOX
+ #endif
+
+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+ // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+
+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+ //#define RPI_SIMULATE_QPUS
+ #ifdef RPI_WORKER
+ #include "pthread.h"
+ #endif
+
+ static void rpi_execute_dblk_cmds(HEVCContext *s);
+ static void rpi_execute_transform(HEVCContext *s);
+ static void rpi_launch_vpu_qpu(HEVCContext *s);
+ static void rpi_execute_pred_cmds(HEVCContext *s);
+ static void rpi_execute_inter_cmds(HEVCContext *s);
+ static void rpi_begin(HEVCContext *s);
+ static void flush_frame(HEVCContext *s,AVFrame *frame);
+ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+
+#endif
+
+// #define DISABLE_MC
+
+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+
+#ifndef av_mod_uintp2
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+ return a & ((1 << p) - 1);
+}
+# define av_mod_uintp2 av_mod_uintp2_c
+#endif
+
const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };

+
+#ifdef RPI_INTER_QPU
+
+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+// For each block of 64*64 the smallest block size is 8x4
+// We also need an extra command for the setup information
+
+#define RPI_CHROMA_COMMAND_WORDS 12
+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+// The QPU code for UV blocks only works up to a block width of 8
+#define RPI_CHROMA_BLOCK_WIDTH 8
+
+#define RPI_LUMA_COMMAND_WORDS 10
+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+
+// TODO Chroma only needs 4 taps
+
+// Actual filter goes -ve, +ve, +ve, -ve using these values
+static const uint32_t rpi_filter_coefs[8][1] = {
+ { ENCODE_COEFFS( 0, 64, 0, 0) },
+ { ENCODE_COEFFS( 2, 58, 10, 2) },
+ { ENCODE_COEFFS( 4, 54, 16, 2) },
+ { ENCODE_COEFFS( 6, 46, 28, 4) },
+ { ENCODE_COEFFS( 4, 36, 36, 4) },
+ { ENCODE_COEFFS( 4, 28, 46, 6) },
+ { ENCODE_COEFFS( 2, 16, 54, 4) },
+ { ENCODE_COEFFS( 2, 10, 58, 2) }
+};
+
+#endif
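A worked example of the ENCODE_COEFFS packing above: the four chroma filter taps go into one 32-bit uniform word, one byte per tap, lowest tap in the lowest byte (the signs are applied by the QPU code itself, as the comment above notes). Standalone snippet, illustrative only:

#include <stdint.h>
#include <stdio.h>

#define ENCODE_COEFFS(c0, c1, c2, c3) \
    (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)

int main(void)
{
    uint32_t w = ENCODE_COEFFS(6, 46, 28, 4);  // the 3/8-pel entry
    printf("0x%08x\n", (unsigned)w);           // prints 0x041c2e06
    return 0;
}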
+
+
+#ifdef RPI_WORKER
+
+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+
+#define LOG_ENTER
+#define LOG_EXIT
+
+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+static void worker_submit_job(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ s->worker_tail++;
+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call this to say we have completed pass1
+static void worker_complete_job(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ s->worker_head++;
+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call this to wait for all jobs to have completed at the end of a frame
+static void worker_wait(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ while( s->worker_head !=s->worker_tail)
+ {
+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+// available to receive the next job.
+static void worker_pass0_ready(HEVCContext *s)
+{
+ LOG_ENTER
+ pthread_mutex_lock(&s->worker_mutex);
+ // tail is number of submitted jobs
+ // head is number of completed jobs
+ // tail-head is number of outstanding jobs in the queue
+ // we need to ensure there is at least 1 space left for us to use
+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+ {
+ // Wait until another job is completed
+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+ LOG_EXIT
+}
+
+static void *worker_start(void *arg)
+{
+ HEVCContext *s = (HEVCContext *)arg;
+ while(1) {
+ pthread_mutex_lock(&s->worker_mutex);
+
+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+ {
+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+ }
+ pthread_mutex_unlock(&s->worker_mutex);
+
+ if (s->kill_worker) {
+ break;
+ }
+ LOG_ENTER
+ // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+ rpi_launch_vpu_qpu(s);
+ // Perform inter prediction
+ rpi_execute_inter_cmds(s);
+ // Wait for transform completion
+ vpu_wait(s->vpu_id);
+
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s);
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s);
+
+ worker_complete_job(s);
+ LOG_EXIT
+ }
+ return NULL;
+}
+
+#endif
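The queue discipline here is a fixed ring where `tail` counts submissions and `head` counts completions, guarded by one mutex and two condition variables. A minimal self-contained model of the same pattern (assumed/simplified names, for illustration only):

#include <pthread.h>

#define MAX_JOBS 2

struct jobq {
    pthread_mutex_t mutex;
    pthread_cond_t  cond_head, cond_tail;
    int head, tail;                 // tail - head == jobs in flight
};

static void jobq_submit(struct jobq *q)      // producer: pass 0 done
{
    pthread_mutex_lock(&q->mutex);
    q->tail++;
    pthread_cond_broadcast(&q->cond_tail);   // wake the worker
    pthread_mutex_unlock(&q->mutex);
}

static void jobq_wait_space(struct jobq *q)  // producer: need a free slot
{
    pthread_mutex_lock(&q->mutex);
    while (q->tail - q->head >= MAX_JOBS)
        pthread_cond_wait(&q->cond_head, &q->mutex);
    pthread_mutex_unlock(&q->mutex);
}

static void jobq_complete(struct jobq *q)    // consumer: pass 1 done
{
    pthread_mutex_lock(&q->mutex);
    q->head++;
    pthread_cond_broadcast(&q->cond_head);   // wake anyone waiting for space
    pthread_mutex_unlock(&q->mutex);
}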
+
/**
* NOTE: Each function hls_foo correspond to the function foo in the
* specification (HLS stands for High Level Syntax).
@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
/* free everything allocated by pic_arrays_init() */
static void pic_arrays_free(HEVCContext *s)
{
+#ifdef RPI
+ int job;
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ if (s->coeffs_buf_arm[job][0]) {
+ gpu_free(&s->coeffs_buf_default[job]);
+ s->coeffs_buf_arm[job][0] = 0;
+ }
+ if (s->coeffs_buf_arm[job][2]) {
+ gpu_free(&s->coeffs_buf_accelerated[job]);
+ s->coeffs_buf_arm[job][2] = 0;
+ }
+ }
+#endif
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+
+ if (dvq->vpu_cmds_arm) {
+ gpu_free(&dvq->deblock_vpu_gmem);
+ dvq->vpu_cmds_arm = 0;
+ }
+ }
+ }
+#endif
av_freep(&s->sao);
av_freep(&s->deblock);

@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
int ctb_count = sps->ctb_width * sps->ctb_height;
int min_pu_size = sps->min_pu_width * sps->min_pu_height;

+#ifdef RPI
+ int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+ int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+ int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+ int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+ int job;
+
+ av_assert0(sps);
+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+ s->ctu_per_y_chan = s->max_ctu_count / 12;
+ s->ctu_per_uv_chan = s->max_ctu_count / 8;
+ printf("Allocated %d\n",coefs_per_row);
+ for(job=0;job<RPI_MAX_JOBS;job++) {
+ gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+ if (!s->coeffs_buf_arm[job][0])
+ goto fail;
+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra block's worth of data
+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+ if (!s->coeffs_buf_arm[job][2])
+ goto fail;
+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards.
+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+ }
+#endif
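A sketch of the bidirectional fill described by the comments above (assumed, simplified context): smaller coefficient blocks are appended forwards from the start of the buffer, while 32x32 blocks are prepended backwards from one-past-the-end, so the two regions grow towards each other without interleaving.

#include <stdint.h>

typedef struct {
    int16_t *fwd;   // like coeffs_buf_arm[job][2]: next forward slot
    int16_t *bwd;   // like coeffs_buf_arm[job][3]: one past the last backward slot
} coeff_buf;

static int16_t *alloc_fwd(coeff_buf *b, int n)   // e.g. n = 16*16
{
    int16_t *p = b->fwd;
    b->fwd += n;
    return p;
}

static int16_t *alloc_bwd(coeff_buf *b, int n)   // e.g. n = 32*32
{
    b->bwd -= n;                                 // fills in backwards
    return b->bwd;
}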
+#ifdef RPI_DEBLOCK_VPU
+ {
+ int i;
+ s->enable_rpi_deblock = !sps->sao_enabled;
+ s->setup_width = (sps->width+15) / 16;
+ s->setup_height = (sps->height+15) / 16;
+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+
+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+ {
+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+ const unsigned int total_size = cmd_size + y_size + uv_size;
+ int p_vc;
+ uint8_t * p_arm;
+ #if RPI_VPU_DEBLOCK_CACHED
+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+ #else
+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+ #endif
+ p_vc = dvq->deblock_vpu_gmem.vc;
+ p_arm = dvq->deblock_vpu_gmem.arm;
+
+ // Zap all
+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+
+ // Subdivide
+ dvq->vpu_cmds_arm = (void*)p_arm;
+ dvq->vpu_cmds_vc = p_vc;
+
+ p_arm += cmd_size;
+ p_vc += cmd_size;
+
+ dvq->y_setup_arm = (void*)p_arm;
+ dvq->y_setup_vc = (void*)p_vc;
+
+ p_arm += y_size;
+ p_vc += y_size;
+
+ dvq->uv_setup_arm = (void*)p_arm;
+ dvq->uv_setup_vc = (void*)p_vc;
+
+ dvq->cmd_id = -1;
+ }
+
+ s->dvq_n = 0;
+ s->dvq = s->dvq_ents + s->dvq_n;
+ }
+#endif
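The (x + 15) & ~15 idiom used for cmd_size / y_size / uv_size above rounds a byte count up to the next multiple of 16, so each sub-region of the single GPU allocation starts 16-byte aligned. A tiny standalone check:

#include <stdio.h>

static unsigned int align16(unsigned int x)
{
    return (x + 15) & ~15u;   // round up to a multiple of 16
}

int main(void)
{
    printf("%u %u %u\n", align16(1), align16(16), align16(17)); // 16 16 32
    return 0;
}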
+
s->bs_width = (width >> 2) + 1;
s->bs_height = (height >> 2) + 1;

@@ -137,6 +422,29 @@ fail:
return AVERROR(ENOMEM);
}

+static void default_pred_weight_table(HEVCContext * const s)
+{
+ unsigned int i;
+ s->sh.luma_log2_weight_denom = 0;
+ s->sh.chroma_log2_weight_denom = 0;
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+ s->sh.luma_weight_l0[i] = 1;
+ s->sh.luma_offset_l0[i] = 0;
+ s->sh.chroma_weight_l0[i][0] = 1;
+ s->sh.chroma_offset_l0[i][0] = 0;
+ s->sh.chroma_weight_l0[i][1] = 1;
+ s->sh.chroma_offset_l0[i][1] = 0;
+ }
+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+ s->sh.luma_weight_l1[i] = 1;
+ s->sh.luma_offset_l1[i] = 0;
+ s->sh.chroma_weight_l1[i][0] = 1;
+ s->sh.chroma_offset_l1[i][0] = 0;
+ s->sh.chroma_weight_l1[i][1] = 1;
+ s->sh.chroma_offset_l1[i][1] = 0;
+ }
+}
+
static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
{
int i = 0;
@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
(s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
pred_weight_table(s, gb);
}
+ else
+ {
+ // Give us unit weights
+ default_pred_weight_table(s);
+ }

sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
return 0;
}

+#ifdef RPI
+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+{
+ if (s->enable_rpi) {
+ HEVCLocalContext *lc = s->HEVClc;
+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ cmd->type = RPI_PRED_INTRA;
+ cmd->size = log2_trafo_size;
+ cmd->c_idx = c_idx;
+ cmd->x = x0;
+ cmd->y = y0;
+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+ cmd->mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+ } else {
+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ }
+}
+#endif
+
static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int xBase, int yBase, int cb_xBase, int cb_yBase,
int log2_cb_size, int log2_trafo_size,
@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
int trafo_size = 1 << log2_trafo_size;
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+#endif
}

if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+#endif
}
if (cbf_cb[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
if (lc->cu.pred_mode == MODE_INTRA) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+#endif
}
if (cbf_cr[i])
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+#else
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+#endif
}
} else if (blk_idx == 3) {
int trafo_size_h = 1 << (log2_trafo_size + 1);
int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
ff_hevc_set_neighbour_available(s, xBase, yBase,
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+#endif
if (s->ps.sps->chroma_format_idc == 2) {
ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
trafo_size_h, trafo_size_v);
+#ifdef RPI
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#else
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+#endif
}
}
}
@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
* @param luma_offset additive offset applied to the luma prediction value
*/

+#ifdef RPI_INTER
+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ AVFrame *ref, const Mv *mv, int x_off, int y_off,
+ int block_w, int block_h, int luma_weight, int luma_offset)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_LUMA_UNI;
+ cmd->dst = dst;
+ cmd->dststride = dststride;
+ cmd->src = ref->data[0];
+ cmd->srcstride = ref->linesize[0];
+ cmd->mv = *mv;
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->weight = luma_weight;
+ cmd->offset = luma_offset;
+}
+
+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_LUMA_BI;
+ cmd->dst = dst;
+ cmd->dststride = dststride;
+ cmd->src = ref0->data[0];
+ cmd->srcstride = ref0->linesize[0];
+ cmd->mv = *mv0;
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->src1 = ref1->data[0];
+ cmd->srcstride1 = ref1->linesize[0];
+ cmd->mv1 = *mv1;
+ cmd->ref_idx[0] = current_mv->ref_idx[0];
+ cmd->ref_idx[1] = current_mv->ref_idx[1];
+}
+
+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_CHROMA_UNI;
+ cmd->dst = dst0;
+ cmd->dststride = dststride;
+ cmd->src = src0;
+ cmd->srcstride = srcstride;
+ cmd->mv = current_mv->mv[reflist];
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->weight = chroma_weight;
+ cmd->offset = chroma_offset;
+}
+
+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+{
+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+ cmd->dst = dst0;
+ cmd->dststride = dststride;
+ cmd->src = ref0->data[cidx+1];
+ cmd->srcstride = ref0->linesize[cidx+1];
+ cmd->mv = current_mv->mv[0];
+ cmd->mv1 = current_mv->mv[1];
+ cmd->x_off = x_off;
+ cmd->y_off = y_off;
+ cmd->block_w = block_w;
+ cmd->block_h = block_h;
+ cmd->src1 = ref1->data[cidx+1];
+ cmd->srcstride1 = ref1->linesize[cidx+1];
+ cmd->ref_idx[0] = current_mv->ref_idx[0];
+ cmd->ref_idx[1] = current_mv->ref_idx[1];
+}
+
+#else
+#define RPI_REDIRECT(fn) fn
+#endif
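A toy demonstration of the RPI_REDIRECT token-pasting pattern above (illustrative names only): the macro expands RPI_REDIRECT(luma_mc_uni) to a runtime choice between rpi_luma_mc_uni (queue the command for pass 1) and luma_mc_uni (do the work immediately).

#include <stdio.h>

static void work(int x)     { printf("direct %d\n", x); }
static void rpi_work(int x) { printf("queued %d\n", x); }

// Expands at the use site; `enable_rpi` is whatever is in scope there.
#define REDIRECT(fn) (enable_rpi ? rpi_ ## fn : fn)

int main(void)
{
    int enable_rpi = 1;
    REDIRECT(work)(42);   // (enable_rpi ? rpi_work : work)(42)
    return 0;
}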
+
static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref, const Mv *mv, int x_off, int y_off,
int block_w, int block_h, int luma_weight, int luma_offset)
@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
int idx = ff_hevc_pel_weight[block_w];

+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> 2;
y_off += mv->y >> 2;
src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
* @param mv1 motion vector1 (relative to block position) to get pixel data from
* @param current_mv current motion vector structure
*/
- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
{
@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);

+#ifdef DISABLE_MC
+ return;
+#endif
+
if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
intptr_t _mx = mx << (1 - hshift);
intptr_t _my = my << (1 - vshift);

+#ifdef DISABLE_MC
+ return;
+#endif
+
x_off += mv->x >> (2 + hshift);
y_off += mv->y >> (2 + vshift);
src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
int hshift = s->ps.sps->hshift[1];
int vshift = s->ps.sps->vshift[1];

+#ifdef DISABLE_MC
+ return;
+#endif
+
intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
}
}

-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- int nPbW, int nPbH,
- int log2_cb_size, int partIdx, int idx)
+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
{
#define POS(c_idx, x, y) \
&s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
(((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
- HEVCLocalContext *lc = s->HEVClc;
+ HEVCLocalContext * const lc = s->HEVClc;
int merge_idx = 0;
struct MvField current_mv = {{{ 0 }}};

@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int y_cb = y0 >> log2_min_cb_size;
int x_pu, y_pu;
int i, j;
-
- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);

if (!skip_flag)
lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];

- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi) {
+ const Mv * const mv = &current_mv.mv[0];
+ const unsigned int mx = mv->x & 3;
+ const unsigned int my = mv->y & 3;
+ const unsigned int my_mx = (my<<8) | mx;
+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+ const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+ uint32_t * y = s->curr_y_mvs;
+
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+
+ for(int start_x=0;start_x < nPbW;start_x+=16) {
+ const int bw = nPbW-start_x;
+ const int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+ *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+ {
+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l0[current_mv.ref_idx[0]],
s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ }

if (s->ps.sps->chroma_format_idc) {
- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ const Mv *mv = &current_mv.mv[0];
+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ intptr_t _mx = mx << (1 - hshift);
+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+
+ int x1_c = x0_c + (mv->x >> (2 + hshift));
+ int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ uint32_t *u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ int bw = nPbW_c-start_x;
+ int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
}
@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];

- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi) {
+ const int reflist = 1;
+ const Mv *mv = &current_mv.mv[reflist];
+ int mx = mv->x & 3;
+ int my = mv->y & 3;
+ int my_mx = (my<<8) + mx;
+ int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+ int x1 = x0 + (mv->x >> 2);
+ int y1 = y0 + (mv->y >> 2);
+ uint32_t *y = s->curr_y_mvs;
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ for(int start_x=0;start_x < nPbW;start_x+=16) {
+ int bw = nPbW-start_x;
+ int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+ *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+ *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+
+ {
+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH,
s->sh.luma_weight_l1[current_mv.ref_idx[1]],
s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ }

if (s->ps.sps->chroma_format_idc) {
- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ const int reflist = 1;
+ const int hshift = s->ps.sps->hshift[1];
+ const int vshift = s->ps.sps->vshift[1];
+ const Mv * const mv = &current_mv.mv[reflist];
+ const intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ const intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ const intptr_t _mx = mx << (1 - hshift);
+ const intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+
+ const int x1_c = x0_c + (mv->x >> (2 + hshift));
+ const int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ uint32_t * u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ const int bw = nPbW_c-start_x;
+ const int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);

- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
}
@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW_c = nPbW >> s->ps.sps->hshift[1];
int nPbH_c = nPbH >> s->ps.sps->vshift[1];

- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+#ifdef RPI_LUMA_QPU
+ if (s->enable_rpi && 0) {
+ const Mv *mv = &current_mv.mv[0];
+ int mx = mv->x & 3;
+ int my = mv->y & 3;
+ int my_mx = (my<<8) + mx;
+ const Mv *mv2 = &current_mv.mv[1];
+ int mx2 = mv2->x & 3;
+ int my2 = mv2->y & 3;
+ int my2_mx2 = (my2<<8) + mx2;
+ int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+ int x1 = x0 + (mv->x >> 2);
+ int y1 = y0 + (mv->y >> 2);
+ int x2 = x0 + (mv2->x >> 2);
+ int y2 = y0 + (mv2->y >> 2);
+ uint32_t *y = s->curr_y_mvs;
+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go
+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+ int bw = nPbW-start_x;
+ int bh = nPbH-start_y;
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+ *y++ = my2_mx2_my_mx;
+
+ *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+ s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+ s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+
+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+ }
+ }
+ s->curr_y_mvs = y;
+ } else
+#endif
+ {
+ RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH,
ref1->frame, &current_mv.mv[1], &current_mv);
+ }

if (s->ps.sps->chroma_format_idc) {
- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+#ifdef RPI_INTER_QPU
+ if (s->enable_rpi) {
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ const Mv *mv = &current_mv.mv[0];
+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift);
+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift);
+ intptr_t _mx = mx << (1 - hshift);
+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector
+ int x1_c = x0_c + (mv->x >> (2 + hshift));
+ int y1_c = y0_c + (mv->y >> (2 + vshift));
+
+ const Mv *mv2 = &current_mv.mv[1];
+ intptr_t mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+ intptr_t my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+ intptr_t _mx2 = mx2 << (1 - hshift);
+ intptr_t _my2 = my2 << (1 - vshift); // Fractional part of motion vector
+
+ int x2_c = x0_c + (mv2->x >> (2 + hshift));
+ int y2_c = y0_c + (mv2->y >> (2 + vshift));
+
+
+ uint32_t *u = s->curr_u_mvs;
+ for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+ int bw = nPbW_c-start_x;
+ int bh = nPbH_c-start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx][0];
+ *u++ = rpi_filter_coefs[_my][0];
+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+ *u++ = 0; // Intermediate results are not written back in first pass of B filtering
+ *u++ = 0;
+
+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+ *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+ *u++ = rpi_filter_coefs[_mx2][0];
+ *u++ = rpi_filter_coefs[_my2][0];
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+ }
+ }
+ s->curr_u_mvs = u;
+ return;
+ }
+#endif
+ RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);

- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+ RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
}
}
@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
}

+#ifdef RPI
+static void rpi_execute_dblk_cmds(HEVCContext *s)
+{
+ int n;
+ int job = s->pass1_job;
+ int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ int (*p)[2] = s->dblk_cmds[job];
+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+ }
+ s->num_dblk_cmds[job] = 0;
+}
+
+static void rpi_execute_transform(HEVCContext *s)
+{
+ int i=2;
+ int job = s->pass1_job;
+ /*int j;
+ int16_t *coeffs = s->coeffs_buf_arm[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+ s->hevcdsp.idct[4-2](coeffs, 16);
+ }
+ i=3;
+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+ s->hevcdsp.idct[5-2](coeffs, 32);
+ }*/
+
+ gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+ //gpu_cache_flush(&s->coeffs_buf_accelerated);
+ //vpu_wait(s->vpu_id);
+
+ for(i=0;i<4;i++)
+ s->num_coeffs[job][i] = 0;
+}
+
+static void rpi_execute_pred_cmds(HEVCContext *s)
+{
+ int i;
+ int job = s->pass1_job;
+ HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+#ifdef RPI_WORKER
+ HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+#else
+ HEVCLocalContext *lc = s->HEVClc;
+#endif
+
+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+ if (cmd->type == RPI_PRED_INTRA) {
+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
+ lc->na.cand_left = (cmd->na >> 3) & 1;
+ lc->na.cand_up_left = (cmd->na >> 2) & 1;
+ lc->na.cand_up = (cmd->na >> 1) & 1;
+ lc->na.cand_up_right = (cmd->na >> 0) & 1;
+ s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+ } else {
+#ifdef RPI_PRECLEAR
+ int trafo_size = 1 << cmd->size;
+#endif
+ s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+#ifdef RPI_PRECLEAR
+ memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+#endif
+ }
+ }
+ s->num_pred_cmds[job] = 0;
+}
+
+static void rpi_execute_inter_cmds(HEVCContext *s)
+{
+ int job = s->pass1_job;
+ HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+ int n,cidx;
+ AVFrame myref;
+ AVFrame myref1;
+ struct MvField mymv;
+ if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+ printf("Overflow inter_cmds\n");
+ exit(-1);
+ }
+ for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+ switch(cmd->cmd) {
+ case RPI_CMD_LUMA_UNI:
+ myref.data[0] = cmd->src;
+ myref.linesize[0] = cmd->srcstride;
+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+ break;
+ case RPI_CMD_LUMA_BI:
+ myref.data[0] = cmd->src;
+ myref.linesize[0] = cmd->srcstride;
+ myref1.data[0] = cmd->src1;
+ myref1.linesize[0] = cmd->srcstride1;
+ mymv.ref_idx[0] = cmd->ref_idx[0];
+ mymv.ref_idx[1] = cmd->ref_idx[1];
+ luma_mc_bi(s, cmd->dst, cmd->dststride,
+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+ &myref1, &cmd->mv1, &mymv);
+ break;
+ case RPI_CMD_CHROMA_UNI:
|
|
+ mymv.mv[0] = cmd->mv;
|
|
+ chroma_mc_uni(s, cmd->dst,
|
|
+ cmd->dststride, cmd->src, cmd->srcstride, 0,
|
|
+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
|
|
+ break;
|
|
+ case RPI_CMD_CHROMA_BI:
|
|
+ case RPI_CMD_CHROMA_BI+1:
|
|
+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
|
|
+ myref.data[cidx+1] = cmd->src;
|
|
+ myref.linesize[cidx+1] = cmd->srcstride;
|
|
+ myref1.data[cidx+1] = cmd->src1;
|
|
+ myref1.linesize[cidx+1] = cmd->srcstride1;
|
|
+ mymv.ref_idx[0] = cmd->ref_idx[0];
|
|
+ mymv.ref_idx[1] = cmd->ref_idx[1];
|
|
+ mymv.mv[0] = cmd->mv;
|
|
+ mymv.mv[1] = cmd->mv1;
|
|
+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
|
|
+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ s->num_mv_cmds[job] = 0;
|
|
+}
|
|
+
|
|
+static void rpi_do_all_passes(HEVCContext *s)
+{
+    // Kick off QPUs and VPUs
+    rpi_launch_vpu_qpu(s);
+    // Perform luma inter prediction
+    rpi_execute_inter_cmds(s);
+    // Wait for transform completion
+    vpu_wait(s->vpu_id);
+    // Perform intra prediction and residual reconstruction
+    rpi_execute_pred_cmds(s);
+    // Perform deblocking for CTBs in this row
+    rpi_execute_dblk_cmds(s);
+    // Prepare next batch
+    rpi_begin(s);
+}
+
+#endif
|
|
+
|
|
+#ifdef RPI
|
|
+static void rpi_begin(HEVCContext *s)
|
|
+{
|
|
+ int job = s->pass0_job;
|
|
+ int i;
|
|
+#ifdef RPI_INTER_QPU
|
|
+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1];
|
|
+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1];
|
|
+
|
|
+ for(i=0;i<8;i++) {
|
|
+ s->u_mvs[job][i] = s->mvs_base[job][i];
|
|
+ *s->u_mvs[job][i]++ = 0;
|
|
+ *s->u_mvs[job][i]++ = 0;
|
|
+ *s->u_mvs[job][i]++ = 0;
|
|
+ *s->u_mvs[job][i]++ = 0;
|
|
+ *s->u_mvs[job][i]++ = 0;
|
|
+ *s->u_mvs[job][i]++ = pic_width;
|
|
+ *s->u_mvs[job][i]++ = pic_height;
|
|
+ *s->u_mvs[job][i]++ = s->frame->linesize[1];
|
|
+ *s->u_mvs[job][i]++ = s->frame->linesize[2];
|
|
+ *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
|
|
+ *s->u_mvs[job][i]++ = 0;
|
|
+ *s->u_mvs[job][i]++ = i; // Select section of VPM (avoid collisions with 3d unit)
|
|
+ }
|
|
+ s->curr_u_mvs = s->u_mvs[job][0];
|
|
+#endif
|
|
+
|
|
+#ifdef RPI_LUMA_QPU
+    for(i=0;i<12;i++) {
+        // This needs to have a generally similar structure to the
+        // actual filter code as various pipelined bits need to land correctly
+        // when inserted by the filter requests
+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
+        *s->y_mvs[job][i]++ = 0; // y_x
+        *s->y_mvs[job][i]++ = 0; // ref_y_base
+        *s->y_mvs[job][i]++ = 0; // y2_x2
+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight denom + 6
+        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+        *s->y_mvs[job][i]++ = 0; // Next kernel
+    }
+    s->curr_y_mvs = s->y_mvs[job][0];
+#endif
+    s->ctu_count = 0;
+}
+#endif
|
|
+
|
|
+#ifdef RPI_SIMULATE_QPUS
|
|
+
|
|
+static int32_t clipx(int x,int FRAME_WIDTH)
|
|
+{
|
|
+ if (x<=0) return 0;
|
|
+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
|
|
+ return x;
|
|
+}
|
|
+
|
|
+static int32_t clipy(int y,int FRAME_HEIGHT)
|
|
+{
|
|
+ if (y<=0) return 0;
|
|
+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
|
|
+ return y;
|
|
+}
|
|
+
|
|
+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
|
|
+{
|
|
+ int32_t vsum = 0;
|
|
+ int x, y;
|
|
+
|
|
+ for (y = 0; y < 8; y++) {
|
|
+ int32_t hsum = 0;
|
|
+
|
|
+ for (x = 0; x < 8; x++)
|
|
+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
|
|
+
|
|
+ vsum += lumaFilter[my][y]*hsum;
|
|
+ }
|
|
+ vsum >>= 6;
|
|
+ vsum = (((vsum*weight)+round)>>denom)+offset;
|
|
+
|
|
+ return av_clip_uint8( vsum );
|
|
+}*/
|
|
+
|
|
+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
|
|
+{
|
|
+ int32_t vsum = 0;
|
|
+ int x, y;
|
|
+ int chromaFilterH[4];
|
|
+ int chromaFilterV[4];
|
|
+ int i;
|
|
+ int offset_after = offset_weight>>16;
|
|
+ int weight = (offset_weight<<16)>>16;
|
|
+ for(i=0;i<4;i++) {
|
|
+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
|
|
+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
|
|
+ }
|
|
+
|
|
+ for (y = 0; y < 4; y++) {
|
|
+ int32_t hsum = 0;
|
|
+
|
|
+ for (x = 0; x < 4; x++)
|
|
+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
|
|
+
|
|
+ vsum += chromaFilterV[y]*hsum;
|
|
+ }
|
|
+ vsum >>= 6;
|
|
+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
|
|
+
|
|
+ return vsum;
|
|
+}
|
|
+
|
|
+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
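(Each row of lumaFilter sums to 64, e.g. -1+4-10+58+17-5+1+0 = 64, which is why the filter routines here shift the accumulated sum right by 6 before weighting.)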
|
|
+
|
|
+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
|
|
+{
|
|
+ int32_t vsum = 0;
|
|
+ int x, y;
|
|
+ int i;
|
|
+ int offset_after = offset_weight>>16;
|
|
+ int weight = (offset_weight<<16)>>16;
|
|
+
|
|
+ for (y = 0; y < 8; y++) {
|
|
+ int32_t hsum = 0;
|
|
+
|
|
+ for (x = 0; x < 8; x++)
|
|
+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
|
|
+
|
|
+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
|
|
+ }
|
|
+ vsum >>= 6;
|
|
+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
|
|
+
|
|
+ return vsum;
|
|
+}
|
|
+
|
|
+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
|
|
+{
|
|
+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
|
|
+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
|
|
+ int pitch = frame->linesize[cIdx];
|
|
+ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
|
|
+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
|
|
+ if (p>=base && p<base+pitch*pic_height) {
|
|
+ return frame->data[cIdx] + (p-base);
|
|
+ }
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
|
|
+{
|
|
+ SliceHeader *sh = &s->sh;
|
|
+ uint8_t *arm = test_frame(s,p,s->frame,cIdx);
|
|
+ int i;
|
|
+ if (arm) return arm;
|
|
+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
|
|
+ {
|
|
+ for(i=0;i<sh->nb_refs[L0];i++) {
|
|
+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
|
|
+ if (arm) return arm;
|
|
+ }
|
|
+ }
|
|
+ if (sh->slice_type == B_SLICE) {
|
|
+ for(i=0;i<sh->nb_refs[L1];i++) {
|
|
+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
|
|
+ if (arm) return arm;
|
|
+ }
|
|
+ }
|
|
+ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
|
|
+ exit(-1);
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
|
|
+{
|
|
+ uint32_t next_kernel;
|
|
+ uint32_t x0;
|
|
+ uint32_t y0;
|
|
+ uint8_t *ref_u_base;
|
|
+ uint8_t *ref_v_base;
|
|
+ uint32_t frame_width = p[5];
|
|
+ uint32_t frame_height = p[6];
|
|
+ uint32_t pitch = p[7];
|
|
+ uint32_t dst_pitch = p[8];
|
|
+ int32_t offset_before = p[9];
|
|
+ int32_t denom = p[10];
|
|
+ uint32_t vpm_id = p[11];
|
|
+ uint32_t tmp_u_dst[256];
|
|
+ uint32_t tmp_v_dst[256];
|
|
+ while(1) {
|
|
+ p += 12;
|
|
+ next_kernel = p[0-12];
|
|
+ x0 = p[1-12];
|
|
+ y0 = p[2-12];
|
|
+ if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
|
|
+ int x,y;
|
|
+ uint32_t width_height = p[5];
|
|
+ uint32_t hcoeffs = p[6];
|
|
+ uint32_t vcoeffs = p[7];
|
|
+ uint32_t offset_weight_u = p[8];
|
|
+ uint32_t offset_weight_v = p[9];
|
|
+ uint8_t *this_u_dst;
|
|
+ uint8_t *this_v_dst;
|
|
+ uint32_t width = width_height >> 16;
|
|
+ uint32_t height = (width_height << 16) >> 16;
|
|
+ ref_u_base = compute_arm_addr(s,p[3-12],1);
|
|
+ ref_v_base = compute_arm_addr(s,p[4-12],2);
|
|
+ if (next_kernel!=s->mc_filter_uv_b0)
|
|
+ {
|
|
+ this_u_dst = compute_arm_addr(s,p[10],1);
|
|
+ this_v_dst = compute_arm_addr(s,p[11],2);
|
|
+ }
|
|
+ for (y=0; y<height; ++y) {
|
|
+ for (x=0; x<width; ++x) {
|
|
+ if (next_kernel==s->mc_filter_uv) {
|
|
+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
|
|
+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
|
|
+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
|
|
+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
|
|
+ } else if (next_kernel==s->mc_filter_uv_b0) {
|
|
+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
|
|
+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
|
|
+ tmp_u_dst[x+y*16] = refa;
|
|
+ tmp_v_dst[x+y*16] = refb;
|
|
+ } else {
|
|
+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
|
|
+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
|
|
+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
|
|
+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
|
|
+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
|
|
+{
|
|
+ uint32_t next_kernel;
|
|
+ int y_x,y2_x2;
|
|
+ int x0;
|
|
+ int y0;
|
|
+ int x2;
|
|
+ int y2;
|
|
+ uint32_t *p0 = p;
|
|
+ uint8_t *ref_y_base;
|
|
+ uint8_t *ref_y2_base;
|
|
+ uint32_t frame_width_height = p[4];
|
|
+ uint32_t frame_width = frame_width_height>>16;
|
|
+ uint32_t frame_height = (frame_width_height<<16)>>16;
|
|
+ uint32_t pitch = p[5];
|
|
+ uint32_t dst_pitch = p[6];
|
|
+ int offset_shift = p[7];
|
|
+ int32_t offset_before = offset_shift>>16;
|
|
+ int32_t denom = (offset_shift<<16)>>16;
|
|
+ while(1) {
|
|
+ p += 9;
|
|
+ next_kernel = p[8-9];
|
|
+ y_x = p[0-9];
|
|
+ x0 = (y_x<<16)>>16;
|
|
+ y0 = y_x>>16;
|
|
+ y2_x2 = p[2-9];
|
|
+ x2 = (y2_x2<<16)>>16;
|
|
+ y2 = y2_x2>>16;
|
|
+
|
|
+ if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
|
|
+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
|
|
+ int x,y;
|
|
+ uint32_t width_height = p[4];
|
|
+ uint32_t my2_mx2_my_mx = p[5];
|
|
+ uint32_t offset_weight = p[6];
|
|
+ uint8_t *this_dst = compute_arm_addr(s,p[7],0);
|
|
+ uint32_t width = width_height >> 16;
|
|
+ uint32_t height = (width_height << 16) >> 16;
|
|
+ uint8_t *dst_base = s->frame->data[0];
|
|
+ ref_y_base = compute_arm_addr(s,p[1-9],0);
|
|
+ ref_y2_base = compute_arm_addr(s,p[3-9],0);
|
|
+ for (y=0; y<height; ++y) {
|
|
+ for (x=0; x<width; ++x) {
|
|
+ if (next_kernel==s->mc_filter) {
|
|
+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
|
|
+ refa = av_clip_uint8(refa);
|
|
+ this_dst[x+y*dst_pitch] = refa;
|
|
+ }
|
|
+ else {
|
|
+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
|
|
+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
|
|
+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void rpi_simulate_inter_qpu(HEVCContext *s)
|
|
+{
|
|
+ // First run the transform as normal
|
|
+ int i;
|
|
+ rpi_execute_transform(s);
|
|
+ for(i=0;i<8;i++)
|
|
+ {
|
|
+ rpi_simulate_inter_chroma(s,s->mvs_base[i]);
|
|
+ }
|
|
+ for(i=0;i<12;i++)
|
|
+ {
|
|
+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
|
|
+ }
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+#ifdef RPI_INTER_QPU
|
|
+
|
|
+static void rpi_launch_vpu_qpu(HEVCContext *s)
|
|
+{
|
|
+ int k;
|
|
+ int job = s->pass1_job;
|
|
+ int i;
|
|
+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
|
|
+#endif
|
|
+ if (s->sh.slice_type == I_SLICE) {
|
|
+#ifdef RPI_MULTI_MAILBOX
|
|
+ rpi_execute_transform(s);
|
|
+ return;
|
|
+#endif
|
|
+ }
|
|
+ for(k=0;k<8;k++) {
|
|
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
|
|
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
|
|
+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
|
|
+ av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
|
|
+ }
|
|
+
|
|
+ s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
|
|
+
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ for(k=0;k<12;k++) {
|
|
+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
|
|
+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
|
|
+ s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
|
|
+ av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
|
|
+ }
|
|
+ s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
|
|
+#endif
|
|
+
|
|
+#ifdef RPI_SIMULATE_QPUS
|
|
+ rpi_simulate_inter_qpu(s);
|
|
+ return;
|
|
+#endif
|
|
+
|
|
+#ifdef RPI_MULTI_MAILBOX
|
|
+#ifdef RPI_CACHE_UNIF_MVS
|
|
+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
|
|
+#else
|
|
+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
|
|
+#endif
|
|
+
|
|
+#if 1
|
|
+ {
|
|
+ unsigned int i;
|
|
+ uint32_t * p;
|
|
+ uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
|
|
+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
|
|
+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
|
|
+
|
|
+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
|
|
+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
|
|
+ *p++ = code;
|
|
+ }
|
|
+
|
|
+ code = qpu_get_fn(QPU_MC_SETUP);
|
|
+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
|
|
+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
|
|
+ *p++ = code;
|
|
+ }
|
|
+
|
|
+ s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
|
|
+ vpu_get_constants(),
|
|
+ s->coeffs_buf_vc[job][2],
|
|
+ s->num_coeffs[job][2] >> 8,
|
|
+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
|
|
+ s->num_coeffs[job][3] >> 10,
|
|
+ 0,
|
|
+ // QPU job 1
|
|
+ QPU_N_UV,
|
|
+ mail_uv,
|
|
+ // QPU job 2
|
|
+ QPU_N_Y,
|
|
+ mail_y
|
|
+ );
|
|
+ }
|
|
+
|
|
+#else
|
|
+ s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
|
|
+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
|
|
+ qpu_get_fn(QPU_MC_SETUP_UV),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ qpu_get_fn(QPU_MC_SETUP),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
|
|
+#else
|
|
+ 0,
|
|
+ 0,0,0,0,
|
|
+ 0,0,0,0,
|
|
+ 0,0,0,0
|
|
+#endif
|
|
+ );
|
|
+#endif
|
|
+ for(i=0;i<4;i++)
|
|
+ s->num_coeffs[job][i] = 0;
|
|
+#else
|
|
+#error Code rotted here
|
|
+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
|
|
+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
|
|
+ );
|
|
+#endif
|
|
+
|
|
+
|
|
+}
|
|
+#else
|
|
+
|
|
+#ifdef RPI
|
|
+static void rpi_launch_vpu_qpu(HEVCContext *s)
|
|
+{
|
|
+ rpi_execute_transform(s);
|
|
+}
|
|
+#endif
|
|
+
|
|
+#endif
|
|
+
|
|
+#ifdef RPI
|
|
+
|
|
+#ifndef RPI_FAST_CACHEFLUSH
|
|
+#error RPI_FAST_CACHEFLUSH is broken
|
|
+static void flush_buffer(AVBufferRef *bref) {
|
|
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
|
|
+ gpu_cache_flush(p);
|
|
+}
|
|
+#endif
|
|
+
|
|
+static void flush_frame(HEVCContext *s,AVFrame *frame)
|
|
+{
|
|
+#ifdef RPI_FAST_CACHEFLUSH
|
|
+ struct vcsm_user_clean_invalid_s iocache = {};
|
|
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
|
|
+ int n = s->ps.sps->height;
|
|
+ int curr_y = 0;
|
|
+ int curr_uv = 0;
|
|
+ int n_uv = n >> s->ps.sps->vshift[1];
|
|
+ int sz,base;
|
|
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
|
|
+ base = s->frame->linesize[1] * curr_uv;
|
|
+ iocache.s[0].handle = p.vcsm_handle;
|
|
+ iocache.s[0].cmd = 3; // clean+invalidate
|
|
+ iocache.s[0].addr = (int)(p.arm) + base;
|
|
+ iocache.s[0].size = sz;
|
|
+ p = get_gpu_mem_ptr_v(s->frame);
|
|
+ iocache.s[1].handle = p.vcsm_handle;
|
|
+ iocache.s[1].cmd = 3; // clean+invalidate
|
|
+ iocache.s[1].addr = (int)(p.arm) + base;
|
|
+ iocache.s[1].size = sz;
|
|
+ p = get_gpu_mem_ptr_y(s->frame);
|
|
+ sz = s->frame->linesize[0] * (n-curr_y);
|
|
+ base = s->frame->linesize[0] * curr_y;
|
|
+ iocache.s[2].handle = p.vcsm_handle;
|
|
+ iocache.s[2].cmd = 3; // clean+invalidate
|
|
+ iocache.s[2].addr = (int)(p.arm) + base;
|
|
+ iocache.s[2].size = sz;
|
|
+ vcsm_clean_invalid( &iocache );
|
|
+#else
|
|
+ flush_buffer(frame->buf[0]);
|
|
+ flush_buffer(frame->buf[1]);
|
|
+ flush_buffer(frame->buf[2]);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
|
|
+{
|
|
+#ifdef RPI_FAST_CACHEFLUSH
|
|
+ struct vcsm_user_clean_invalid_s iocache = {};
|
|
+ int n;
|
|
+ int curr_y;
|
|
+ int curr_uv;
|
|
+ int n_uv;
|
|
+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
|
|
+ int sz,base;
|
|
+ int (*d)[2] = s->dblk_cmds[job];
|
|
+ int low=(*d)[1];
|
|
+ int high=(*d)[1];
|
|
+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
|
|
+ int y = (*d)[1];
|
|
+ low=FFMIN(low,y);
|
|
+ high=FFMAX(high,y);
|
|
+ }
|
|
+ curr_y = low;
|
|
+ n = high+(1 << s->ps.sps->log2_ctb_size);
|
|
+ curr_uv = curr_y >> s->ps.sps->vshift[1];
|
|
+ n_uv = n >> s->ps.sps->vshift[1];
|
|
+
|
|
+ sz = s->frame->linesize[1] * (n_uv-curr_uv);
|
|
+ base = s->frame->linesize[1] * curr_uv;
|
|
+ iocache.s[0].handle = p.vcsm_handle;
|
|
+ iocache.s[0].cmd = 3; // clean+invalidate
|
|
+ iocache.s[0].addr = (int)(p.arm) + base;
|
|
+ iocache.s[0].size = sz;
|
|
+ p = get_gpu_mem_ptr_v(s->frame);
|
|
+ iocache.s[1].handle = p.vcsm_handle;
|
|
+ iocache.s[1].cmd = 3; // clean+invalidate
|
|
+ iocache.s[1].addr = (int)(p.arm) + base;
|
|
+ iocache.s[1].size = sz;
|
|
+ p = get_gpu_mem_ptr_y(s->frame);
|
|
+ sz = s->frame->linesize[0] * (n-curr_y);
|
|
+ base = s->frame->linesize[0] * curr_y;
|
|
+ iocache.s[2].handle = p.vcsm_handle;
|
|
+ iocache.s[2].cmd = 3; // clean+invalidate
|
|
+ iocache.s[2].addr = (int)(p.arm) + base;
|
|
+ iocache.s[2].size = sz;
|
|
+
|
|
+ iocache.s[3].handle = p0->vcsm_handle;
|
|
+ iocache.s[3].cmd = 3; // clean+invalidate
|
|
+ iocache.s[3].addr = (int) p0->arm;
|
|
+ iocache.s[3].size = p0->numbytes;
|
|
+ if (p1) {
|
|
+ iocache.s[4].handle = p1->vcsm_handle;
|
|
+ iocache.s[4].cmd = 3; // clean+invalidate
|
|
+ iocache.s[4].addr = (int) p1->arm;
|
|
+ iocache.s[4].size = p1->numbytes;
|
|
+ }
|
|
+ if (p2) {
|
|
+ iocache.s[5].handle = p2->vcsm_handle;
|
|
+ iocache.s[5].cmd = 3; // clean+invalidate
|
|
+ iocache.s[5].addr = (int) p2->arm;
|
|
+ iocache.s[5].size = p2->numbytes;
|
|
+ }
|
|
+ vcsm_clean_invalid( &iocache );
|
|
+#else
|
|
+ flush_buffer(frame->buf[0]);
|
|
+ flush_buffer(frame->buf[1]);
|
|
+ flush_buffer(frame->buf[2]);
|
|
+ gpu_cache_flush3(p0, p1, p2);
|
|
+#endif
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
|
|
{
|
|
HEVCContext *s = avctxt->priv_data;
|
|
@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
|
|
int y_ctb = 0;
|
|
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
|
|
|
|
+#ifdef RPI
+    s->enable_rpi = s->ps.sps->bit_depth == 8
+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+
+    if (!s->enable_rpi) {
+        if (s->ps.pps->cross_component_prediction_enabled_flag)
+            printf("Cross component\n");
+    }
+#endif
|
|
+ //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
|
|
+
|
|
if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
|
|
av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
|
|
return AVERROR_INVALIDDATA;
|
|
@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
|
|
}
|
|
}
|
|
|
|
+#ifdef RPI_WORKER
|
|
+ s->pass0_job = 0;
|
|
+ s->pass1_job = 0;
|
|
+#endif
|
|
+#ifdef RPI
|
|
+ rpi_begin(s);
|
|
+#endif
|
|
+
|
|
while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
|
|
int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
|
|
|
|
@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
|
|
s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
|
|
s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
|
|
|
|
+#ifdef RPI_INTER_QPU
|
|
+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
|
|
+#endif
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
|
|
+#endif
|
|
+
|
|
more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
|
|
+
|
|
+#ifdef RPI_INTER_QPU
|
|
+ s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
|
|
+#endif
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
|
|
+#endif
|
|
+
|
|
+#ifdef RPI
|
|
+ if (s->enable_rpi) {
|
|
+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
|
|
+ //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
|
|
+ //av_assert0(s->pass0_job<RPI_MAX_JOBS);
|
|
+ //av_assert0(s->pass0_job>=0);
|
|
+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
|
|
+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
|
|
+ s->ctu_count++;
|
|
+ //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
|
|
+
|
|
+ if ( s->ctu_count >= s->max_ctu_count ) {
|
|
+#ifdef RPI_WORKER
|
|
+ if (s->used_for_ref) {
|
|
+ // Split work load onto separate threads so we make as rapid progress as possible with this frame
|
|
+ // Pass on this job to worker thread
|
|
+ worker_submit_job(s);
|
|
+ // Make sure we have space to prepare the next job
|
|
+ worker_pass0_ready(s);
|
|
+
|
|
+ // Prepare the next batch of commands
|
|
+ rpi_begin(s);
|
|
+ } else {
|
|
+ // Non-ref frame so do it all on this thread
|
|
+ rpi_do_all_passes(s);
|
|
+ }
|
|
+#else
|
|
+ rpi_do_all_passes(s);
|
|
+#endif
|
|
+ }
|
|
+
|
|
+ }
|
|
+#endif
|
|
+
|
|
+
|
|
if (more_data < 0) {
|
|
s->tab_slice_address[ctb_addr_rs] = -1;
|
|
return more_data;
|
|
@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
|
|
|
|
ctb_addr_ts++;
|
|
ff_hevc_save_states(s, ctb_addr_ts);
|
|
+#ifdef RPI
|
|
+ if (s->enable_rpi)
|
|
+ continue;
|
|
+#endif
|
|
ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
|
|
}
|
|
|
|
+#ifdef RPI
|
|
+
|
|
+#ifdef RPI_WORKER
|
|
+ // Wait for the worker to finish all its jobs
|
|
+ if (s->enable_rpi) {
|
|
+ worker_wait(s);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ // Finish off any half-completed rows
|
|
+ if (s->enable_rpi && s->ctu_count) {
|
|
+ rpi_do_all_passes(s);
|
|
+ }
|
|
+
|
|
+#endif
|
|
+
|
|
if (x_ctb + ctb_size >= s->ps.sps->width &&
|
|
y_ctb + ctb_size >= s->ps.sps->height)
|
|
ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
|
|
@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
|
|
s = s1->sList[self_id];
|
|
lc = s->HEVClc;
|
|
|
|
+#ifdef RPI
|
|
+ s->enable_rpi = 0;
|
|
+ //printf("Wavefront\n");
|
|
+#endif
|
|
+
|
|
if(ctb_row) {
|
|
ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
|
|
|
|
@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
+ s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
|
|
+ s->nal_unit_type == NAL_TSA_N ||
|
|
+ s->nal_unit_type == NAL_STSA_N ||
|
|
+ s->nal_unit_type == NAL_RADL_N ||
|
|
+ s->nal_unit_type == NAL_RASL_N);
|
|
+
|
|
+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
|
|
+ s->is_decoded = 0;
|
|
+ break;
|
|
+ }
|
|
if (s->max_ra == INT_MAX) {
|
|
if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
|
|
s->max_ra = s->poc;
|
|
@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
|
|
}
|
|
|
|
fail:
|
|
- if (s->ref && s->threads_type == FF_THREAD_FRAME)
|
|
+ if (s->ref && s->threads_type == FF_THREAD_FRAME) {
|
|
+#ifdef RPI_INTER_QPU
|
|
+ ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
|
|
+#endif
|
|
ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
|
|
-
|
|
+ } else if (s->ref) {
|
|
+#ifdef RPI_INTER_QPU
|
|
+ // When running single threaded we need to flush the whole frame
|
|
+ flush_frame(s,s->frame);
|
|
+#endif
|
|
+ }
|
|
return ret;
|
|
}
|
|
|
|
@@ -3064,6 +4625,41 @@ fail:
|
|
return AVERROR(ENOMEM);
|
|
}
|
|
|
|
+#ifdef RPI_WORKER
|
|
+static av_cold void hevc_init_worker(HEVCContext *s)
|
|
+{
|
|
+ int err;
|
|
+ pthread_cond_init(&s->worker_cond_head, NULL);
|
|
+ pthread_cond_init(&s->worker_cond_tail, NULL);
|
|
+ pthread_mutex_init(&s->worker_mutex, NULL);
|
|
+
|
|
+ s->worker_tail=0;
|
|
+ s->worker_head=0;
|
|
+ s->kill_worker=0;
|
|
+ err = pthread_create(&s->worker_thread, NULL, worker_start, s);
|
|
+ if (err) {
|
|
+ printf("Failed to create worker thread\n");
|
|
+ exit(-1);
|
|
+ }
|
|
+}
|
|
+
|
|
+static av_cold void hevc_exit_worker(HEVCContext *s)
|
|
+{
|
|
+ void *res;
|
|
+ s->kill_worker=1;
|
|
+ pthread_cond_broadcast(&s->worker_cond_tail);
|
|
+ pthread_join(s->worker_thread, &res);
|
|
+
|
|
+ pthread_cond_destroy(&s->worker_cond_head);
|
|
+ pthread_cond_destroy(&s->worker_cond_tail);
|
|
+ pthread_mutex_destroy(&s->worker_mutex);
|
|
+
|
|
+ s->worker_tail=0;
|
|
+ s->worker_head=0;
|
|
+ s->kill_worker=0;
|
|
+}
|
|
+#endif
|
|
+
|
|
static av_cold int hevc_decode_free(AVCodecContext *avctx)
|
|
{
|
|
HEVCContext *s = avctx->priv_data;
|
|
@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
|
|
|
|
av_freep(&s->cabac_state);
|
|
|
|
+#ifdef RPI
|
|
+
|
|
+#ifdef RPI_WORKER
|
|
+ hevc_exit_worker(s);
|
|
+#endif
|
|
+
|
|
+ for(i=0;i<RPI_MAX_JOBS;i++) {
|
|
+ av_freep(&s->unif_mv_cmds[i]);
|
|
+ av_freep(&s->univ_pred_cmds[i]);
|
|
+
|
|
+#ifdef RPI_INTER_QPU
|
|
+ if (s->unif_mvs[i]) {
|
|
+ gpu_free( &s->unif_mvs_ptr[i] );
|
|
+ s->unif_mvs[i] = 0;
|
|
+ }
|
|
+#endif
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ if (s->y_unif_mvs[i]) {
|
|
+ gpu_free( &s->y_unif_mvs_ptr[i] );
|
|
+ s->y_unif_mvs[i] = 0;
|
|
+ }
|
|
+#endif
|
|
+ }
|
|
+
|
|
+#endif
|
|
+
|
|
for (i = 0; i < 3; i++) {
|
|
av_freep(&s->sao_pixel_buffer_h[i]);
|
|
av_freep(&s->sao_pixel_buffer_v[i]);
|
|
@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
|
|
return 0;
|
|
}
|
|
|
|
+#ifdef RPI
|
|
+#ifdef RPI_PRECLEAR
|
|
+static av_cold void memclear16(int16_t *p, int n)
|
|
+{
|
|
+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
|
|
+ //int i;
|
|
+ //for(i=0;i<n;i++)
|
|
+ // p[i] = 0;
|
|
+}
|
|
+#endif
|
|
+#endif
|
|
+
|
|
static av_cold int hevc_init_context(AVCodecContext *avctx)
|
|
{
|
|
HEVCContext *s = avctx->priv_data;
|
|
int i;
|
|
+ int job;
|
|
|
|
s->avctx = avctx;
|
|
|
|
@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
|
|
s->HEVClcList[0] = s->HEVClc;
|
|
s->sList[0] = s;
|
|
|
|
+#ifdef RPI
|
|
+ for(job=0;job<RPI_MAX_JOBS;job++) {
|
|
+ s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
|
|
+ if (!s->unif_mv_cmds[job])
|
|
+ goto fail;
|
|
+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
|
|
+ if (!s->univ_pred_cmds[job])
|
|
+ goto fail;
|
|
+ }
|
|
+
|
|
+#ifdef RPI_INTER_QPU
|
|
+ // We divide the image into blocks 256 wide and 64 high
|
|
+ // We support up to 2048 widths
|
|
+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
|
|
+ // Also add space for the startup command for each stream.
|
|
+
|
|
+ {
|
|
+ int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
|
|
+ uint32_t *p;
|
|
+ for(job=0;job<RPI_MAX_JOBS;job++) {
|
|
+#ifdef RPI_CACHE_UNIF_MVS
|
|
+ gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
|
|
+#else
|
|
+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
|
|
+#endif
|
|
+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
|
|
+
|
|
+ // Set up initial locations for uniform streams
|
|
+ p = s->unif_mvs[job];
|
|
+ for(i = 0; i < 8; i++) {
|
|
+ s->mvs_base[job][i] = p;
|
|
+ p += uv_commands_per_qpu;
|
|
+ }
|
|
+ }
|
|
+ s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
|
|
+ s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
|
|
+ s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
|
|
+ }
|
|
+
|
|
+#endif
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ for(job=0;job<RPI_MAX_JOBS;job++)
|
|
+ {
|
|
+ int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
|
|
+ uint32_t *p;
|
|
+#ifdef RPI_CACHE_UNIF_MVS
|
|
+ gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
|
|
+#else
|
|
+ gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
|
|
+#endif
|
|
+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
|
|
+
|
|
+ // Set up initial locations for uniform streams
|
|
+ p = s->y_unif_mvs[job];
|
|
+ for(i = 0; i < 12; i++) {
|
|
+ s->y_mvs_base[job][i] = p;
|
|
+ p += y_commands_per_qpu;
|
|
+ }
|
|
+ }
|
|
+ s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
|
|
+ s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
|
|
+#endif
|
|
+ //gpu_malloc_uncached(2048*64,&s->dummy);
|
|
+
|
|
+ s->enable_rpi = 0;
|
|
+
|
|
+#ifdef RPI_WORKER
|
|
+ hevc_init_worker(s);
|
|
+#endif
|
|
+
|
|
+#endif
|
|
+
|
|
s->cabac_state = av_malloc(HEVC_CONTEXTS);
|
|
if (!s->cabac_state)
|
|
goto fail;
|
|
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
|
|
index be91010..6b03ea8 100644
|
|
--- a/libavcodec/hevc.h
|
|
+++ b/libavcodec/hevc.h
|
|
@@ -23,6 +23,9 @@
|
|
#ifndef AVCODEC_HEVC_H
|
|
#define AVCODEC_HEVC_H
|
|
|
|
+// define RPI to split the CABAC/prediction/transform into separate stages
|
|
+#include "config.h"
|
|
+
|
|
#include "libavutil/buffer.h"
|
|
#include "libavutil/md5.h"
|
|
|
|
@@ -37,6 +40,29 @@
|
|
#include "thread.h"
|
|
#include "videodsp.h"
|
|
|
|
+// define RPI to split the CABAC/prediction/transform into separate stages
|
|
+#ifdef RPI
|
|
+
|
|
+ #include "rpi_qpu.h"
|
|
+ // Define RPI_INTER_QPU to use QPU for chroma inter prediction
|
|
+ #define RPI_INTER_QPU
|
|
+
|
|
+ #ifdef RPI_INTER_QPU
|
|
+ // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
|
|
+ #define RPI_LUMA_QPU
|
|
+ #endif
|
|
+
|
|
+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames
|
|
+ #define RPI_MAX_JOBS 2
|
|
+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks
|
|
+ #define RPI_WORKER
|
|
+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
|
|
+// #define RPI_DEBLOCK_VPU
|
|
+
|
|
+#endif
|
|
+
|
|
+#define RPI_VPU_DEBLOCK_CACHED 1
|
|
+
|
|
#define MAX_DPB_SIZE 16 // A.4.1
|
|
#define MAX_REFS 16
|
|
|
|
@@ -660,17 +686,6 @@ typedef struct CodingUnit {
|
|
uint8_t cu_transquant_bypass_flag;
|
|
} CodingUnit;
|
|
|
|
-typedef struct Mv {
|
|
- int16_t x; ///< horizontal component of motion vector
|
|
- int16_t y; ///< vertical component of motion vector
|
|
-} Mv;
|
|
-
|
|
-typedef struct MvField {
|
|
- DECLARE_ALIGNED(4, Mv, mv)[2];
|
|
- int8_t ref_idx[2];
|
|
- int8_t pred_flag;
|
|
-} MvField;
|
|
-
|
|
typedef struct NeighbourAvailable {
|
|
int cand_bottom_left;
|
|
int cand_left;
|
|
@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
|
|
uint8_t flags;
|
|
} HEVCFrame;
|
|
|
|
+#ifdef RPI_WORKER
|
|
+typedef struct HEVCLocalContextIntra {
|
|
+ TransformUnit tu;
|
|
+ NeighbourAvailable na;
|
|
+} HEVCLocalContextIntra;
|
|
+#endif
|
|
+
|
|
typedef struct HEVCLocalContext {
|
|
+ TransformUnit tu;
|
|
+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
|
|
+
|
|
uint8_t cabac_state[HEVC_CONTEXTS];
|
|
|
|
uint8_t stat_coeff[4];
|
|
@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
|
|
|
|
int qPy_pred;
|
|
|
|
- TransformUnit tu;
|
|
|
|
uint8_t ctb_left_flag;
|
|
uint8_t ctb_up_flag;
|
|
@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
|
|
int ct_depth;
|
|
CodingUnit cu;
|
|
PredictionUnit pu;
|
|
- NeighbourAvailable na;
|
|
|
|
#define BOUNDARY_LEFT_SLICE (1 << 0)
|
|
#define BOUNDARY_LEFT_TILE (1 << 1)
|
|
@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
|
|
int boundary_flags;
|
|
} HEVCLocalContext;
|
|
|
|
+
|
|
+#ifdef RPI
|
|
+
|
|
+// The processing is done in chunks
+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
+// This is a distance of 1536 pixels across the screen
+// Increasing RPI_NUM_CHUNKS reduces the time spent activating QPUs and flushing caches,
+// but allocates more memory and increases the latency before data in the next frame can be processed
+#define RPI_NUM_CHUNKS 1
|
|
+
|
|
+// RPI_MAX_WIDTH is the maximum width in pixels supported by the accelerated code
+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+
+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+#define RPI_MAX_MV_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Each block can have an intra prediction and a transform_add command
+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+// Worst case is 16x16 CTUs
+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
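For concreteness, with the default RPI_NUM_CHUNKS = 1 these work out to RPI_MAX_WIDTH = 1*64*24 = 1536 pixels, RPI_MAX_MV_CMDS = RPI_MAX_PRED_CMDS = 2*16*3*(1536/4) = 36864 commands, and RPI_MAX_DEBLOCK_CMDS = 1536*4/16 = 384.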
|
|
+
|
|
+#define RPI_CMD_LUMA_UNI 0
|
|
+#define RPI_CMD_CHROMA_UNI 1
|
|
+#define RPI_CMD_LUMA_BI 2
|
|
+#define RPI_CMD_CHROMA_BI 3
|
|
+#define RPI_CMD_V_BI 4
|
|
+
|
|
+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
|
|
+// #define RPI_PRECLEAR
|
|
+
|
|
+// Command for inter prediction
|
|
+typedef struct HEVCMvCmd {
|
|
+ int cmd;
|
|
+ uint8_t *dst;
|
|
+ ptrdiff_t dststride;
|
|
+ uint8_t *src;
|
|
+ ptrdiff_t srcstride;
|
|
+ Mv mv;
|
|
+ int x_off;
|
|
+ int y_off;
|
|
+ int block_w;
|
|
+ int block_h;
|
|
+ int weight;
|
|
+ int offset;
|
|
+ uint8_t *src1;
|
|
+ ptrdiff_t srcstride1;
|
|
+ Mv mv1;
|
|
+ int8_t ref_idx[2];
|
|
+} HEVCMvCmd;
|
|
+
|
|
+
|
|
+// Command for intra prediction and transform_add of predictions to coefficients
|
|
+#define RPI_PRED_TRANSFORM_ADD 0
|
|
+#define RPI_PRED_INTRA 1
|
|
+typedef struct HEVCPredCmd {
|
|
+ uint8_t size;
|
|
+ uint8_t type;
|
|
+ uint8_t na;
|
|
+ uint8_t c_idx;
|
|
+ union {
|
|
+ uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
|
|
+ uint32_t x; // RPI_PRED_INTRA
|
|
+ };
|
|
+ union {
|
|
+ int16_t *buf; // RPI_PRED_TRANSFORM_ADD
|
|
+ uint32_t y; // RPI_PRED_INTRA
|
|
+ };
|
|
+ union {
|
|
+        enum IntraPredMode mode; // RPI_PRED_INTRA
+        uint32_t stride;         // RPI_PRED_TRANSFORM_ADD
|
|
+ };
|
|
+} HEVCPredCmd;
|
|
+
|
|
+#endif
|
|
+
|
|
typedef struct HEVCContext {
|
|
const AVClass *c; // needed by private avoptions
|
|
AVCodecContext *avctx;
|
|
@@ -798,13 +895,107 @@ typedef struct HEVCContext {
|
|
|
|
HEVCLocalContext *HEVClcList[MAX_NB_THREADS];
|
|
HEVCLocalContext *HEVClc;
|
|
-
|
|
+#ifdef RPI_WORKER
|
|
+ HEVCLocalContextIntra HEVClcIntra;
|
|
+#endif
|
|
uint8_t threads_type;
|
|
uint8_t threads_number;
|
|
|
|
int width;
|
|
int height;
|
|
|
|
+ int used_for_ref;
|
|
+
|
|
+#ifdef RPI
|
|
+ int enable_rpi;
|
|
+ HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
|
|
+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
|
|
+ int buf_width;
|
|
+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
|
|
+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
|
|
+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
|
|
+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
|
|
+ int num_coeffs[RPI_MAX_JOBS][4];
|
|
+ int num_xfm_cmds[RPI_MAX_JOBS];
|
|
+ int num_mv_cmds[RPI_MAX_JOBS];
|
|
+ int num_pred_cmds[RPI_MAX_JOBS];
|
|
+ int num_dblk_cmds[RPI_MAX_JOBS];
|
|
+ int vpu_id;
|
|
+ int pass0_job; // Pass0 does coefficient decode
|
|
+ int pass1_job; // Pass1 does pixel processing
|
|
+ int ctu_count; // Number of CTUs done in pass0 so far
|
|
+ int max_ctu_count; // Number of CTUs when we trigger a round of processing
|
|
+ int ctu_per_y_chan; // Number of CTUs per luma QPU
|
|
+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU
|
|
+#ifdef RPI_INTER_QPU
|
|
+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
|
|
+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
|
|
+
|
|
+ // _base pointers are to the start of the row
|
|
+ uint32_t *mvs_base[RPI_MAX_JOBS][8];
|
|
+ // these pointers are to the next free space
|
|
+ uint32_t *u_mvs[RPI_MAX_JOBS][8];
|
|
+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
|
|
+ // Function pointers
|
|
+ uint32_t mc_filter_uv;
|
|
+ uint32_t mc_filter_uv_b0;
|
|
+ uint32_t mc_filter_uv_b;
|
|
+#endif
|
|
+#ifdef RPI_LUMA_QPU
|
|
+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
|
|
+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
|
|
+ uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
|
|
+ uint32_t *y_mvs[RPI_MAX_JOBS][12];
|
|
+ uint32_t *curr_y_mvs; // Current uniform stream for luma
|
|
+ // Function pointers
|
|
+ uint32_t mc_filter;
|
|
+ uint32_t mc_filter_b;
|
|
+#endif
|
|
+
|
|
+#ifdef RPI_WORKER
|
|
+ pthread_t worker_thread;
|
|
+ pthread_cond_t worker_cond_head;
|
|
+ pthread_cond_t worker_cond_tail;
|
|
+ pthread_mutex_t worker_mutex;
|
|
+
|
|
+ int worker_tail; // Contains the number of posted jobs
|
|
+ int worker_head; // Contains the number of completed jobs
|
|
+ int kill_worker; // set to 1 to terminate the worker
|
|
+#endif
|
|
+
|
|
+#define RPI_DEBLOCK_VPU_Q_COUNT 2
|
|
+
|
|
+#ifdef RPI_DEBLOCK_VPU
|
|
+ int enable_rpi_deblock;
|
|
+
|
|
+ int uv_setup_width;
|
|
+ int uv_setup_height;
|
|
+ int setup_width; // Number of 16x16 blocks across the image
|
|
+ int setup_height; // Number of 16x16 blocks down the image
|
|
+
|
|
+ struct dblk_vpu_q_s
|
|
+ {
|
|
+ GPU_MEM_PTR_T deblock_vpu_gmem;
|
|
+
|
|
+ uint8_t (*y_setup_arm)[2][2][2][4];
|
|
+ uint8_t (*y_setup_vc)[2][2][2][4];
|
|
+
|
|
+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
|
|
+ uint8_t (*uv_setup_vc)[2][2][2][4];
|
|
+
|
|
+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
|
|
+ int vpu_cmds_vc;
|
|
+
|
|
+ int cmd_id;
|
|
+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
|
|
+
|
|
+ struct dblk_vpu_q_s * dvq;
|
|
+ unsigned int dvq_n;
|
|
+
|
|
+#endif
|
|
+
|
|
+#endif
|
|
+
|
|
uint8_t *cabac_state;
|
|
|
|
/** 1 if the independent slice segment header was successfully parsed */
|
|
@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
|
|
uint32_t max_mastering_luminance;
|
|
uint32_t min_mastering_luminance;
|
|
|
|
+#ifdef RPI
|
|
+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
|
|
+#endif
|
|
} HEVCContext;
|
|
|
|
int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
|
|
@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
|
|
int log2_trafo_size, enum ScanType scan_idx,
|
|
int c_idx);
|
|
|
|
+#ifdef RPI_INTER_QPU
|
|
+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
|
|
+#endif
|
|
+
|
|
void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
|
|
|
|
|
|
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
|
|
index 05b2821..e2f1f4e 100644
|
|
--- a/libavcodec/hevc_cabac.c
|
|
+++ b/libavcodec/hevc_cabac.c
|
|
@@ -21,14 +21,72 @@
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
+#define UNCHECKED_BITSTREAM_READER 1
|
|
+
|
|
#include "libavutil/attributes.h"
|
|
#include "libavutil/common.h"
|
|
|
|
-#include "cabac_functions.h"
|
|
#include "hevc.h"
|
|
+#include "cabac_functions.h"
|
|
+
|
|
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV ARCH_X86
|
|
+
|
|
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
|
|
+
|
|
+#if ARCH_ARM
|
|
+#include "arm/hevc_cabac.h"
|
|
+#endif
|
|
|
|
#define CABAC_MAX_BIN 31
|
|
|
|
+
|
|
+#if USE_BY22 && !USE_BY22_DIV
|
|
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
|
|
+
|
|
+static const uint32_t cabac_by22_inv_range[256] = {
|
|
+ 0, I(257), I(258), I(259),
|
|
+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
|
|
+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
|
|
+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
|
|
+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
|
|
+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
|
|
+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
|
|
+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
|
|
+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
|
|
+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
|
|
+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
|
|
+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
|
|
+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
|
|
+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
|
|
+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
|
|
+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
|
|
+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
|
|
+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
|
|
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
|
|
+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
|
|
+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
|
|
+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
|
|
+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
|
|
+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
|
|
+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
|
|
+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
|
|
+ I(510), I(511)
|
|
+};
|
|
+#undef I
|
|
+#endif // USE_BY22
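The I() table above replaces the bypass divide with a multiply-high: each entry holds roughly 2^40/range, so (low * inv) >> 32 approximates (low << 8) / range to about 22 bits, as noted further down. A self-contained sketch of the idea (illustrative values only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint32_t range = 384;          // CABAC range always lies in 256..511
        const uint32_t low   = 1000000000u;  // some arithmetic-coder state
        // Same construction as the I() macro: floor(2^40 / range) + 1
        const uint32_t inv   = (uint32_t)((0x10000000000ULL / range) + 1);
        const uint32_t q_mul = (uint32_t)(((uint64_t)low * inv) >> 32);
        const uint32_t q_div = (uint32_t)(((uint64_t)low << 8) / range);
        // Here both print 666666666; in general q_mul may exceed q_div by 1,
        // so only the high ~22 bits of the result are trusted
        printf("%u %u\n", q_mul, q_div);
        return 0;
    }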
|
|
+
|
|
/**
|
|
* number of bin by SyntaxElement.
|
|
*/
|
|
@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
|
|
{ 28, 36, 43, 49, 54, 58, 61, 63, },
|
|
};
|
|
|
|
+
|
|
+typedef struct
|
|
+{
|
|
+ uint16_t coeff;
|
|
+ uint16_t scale;
|
|
+} xy_off_t;
|
|
+
|
|
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
|
|
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
|
|
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
|
|
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
|
|
+
|
|
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
|
|
+
|
|
+#define OFF_DIAG(t) {\
|
|
+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
|
|
+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
|
|
+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
|
|
+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
|
|
+}
|
|
+
|
|
+#define OFF_HORIZ(t) {\
|
|
+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
|
|
+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
|
|
+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
|
|
+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
|
|
+}
|
|
+
|
|
+#define OFF_VERT(t) {\
|
|
+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
|
|
+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
|
|
+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
|
|
+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
|
|
+}
|
|
+
|
|
+static const xy_off_t off_xys[3][4][16] =
|
|
+{
|
|
+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
|
|
+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
|
|
+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
|
|
+};
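As a quick check of the macros above (a worked example, not part of the patch): for a 16x16 TU, t = 4, so XYT_C(1,2,4) = 1 + (2 << 4) = 33, while SCALE_TRAFO(4) = 3 and SCALE_SHR(4) = 1 give XYT_S(1,2,4) = (1 >> 1) + ((2 >> 1) << 3) = 8, i.e. the scale entry indexes a downsampled 8x8 map.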
|
|
+
|
|
+
|
|
+// Helper fns
|
|
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
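For example (AV_RB32 is a big-endian 32-bit load): with buf = {0x12, 0x34, 0x56, 0x78} and offset = 4, hevc_mem_bits32 loads 0x12345678 and returns 0x23456780, the requested bits left-aligned with the vacated low bits zero-filled.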
|
|
+
|
|
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
|
|
+#define hevc_clz32 hevc_clz32_builtin
|
|
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
|
|
+{
|
|
+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
|
|
+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
|
|
+}
|
|
+#endif
|
|
+
|
|
+// It is unlikely that we will ever need this but include for completeness
|
|
+#ifndef hevc_clz32
|
|
+static inline unsigned int hevc_clz32(unsigned int x)
|
|
+{
|
|
+ unsigned int n = 1;
|
|
+ if ((x & 0xffff0000) == 0) {
|
|
+ n += 16;
|
|
+ x <<= 16;
|
|
+ }
|
|
+ if ((x & 0xff000000) == 0) {
|
|
+ n += 8;
|
|
+ x <<= 8;
|
|
+ }
|
|
+ if ((x & 0xf0000000) == 0) {
|
|
+ n += 4;
|
|
+ x <<= 4;
|
|
+ }
|
|
+ if ((x & 0xc0000000) == 0) {
|
|
+ n += 2;
|
|
+ x <<= 2;
|
|
+ }
|
|
+ return n - ((x >> 31) & 1);
|
|
+}
|
|
+#endif
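A worked check of the fallback (not part of the patch): hevc_clz32(0x00010000) takes n through 1 -> 9 -> 13 -> 15 while x ends at 0x40000000, so it returns 15 - 0 = 15, matching __builtin_clz on a 32-bit int.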
|
|
+
|
|
+
|
|
+#if !USE_BY22
|
|
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
|
|
+// will no longer be called but the setup calls will still exist and we want
|
|
+// to null them out
|
|
+#define bypass_start(s)
|
|
+#define bypass_finish(s)
|
|
+#else
|
|
+// Use BY22 for residual bypass block
+
+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+
+// BY22 notes that bypass is simply a divide into the bitstream and so we
+// can peek out large quantities of bits at once and treat the result as if
+// it was VLC. In many cases this will lead to O(1) processing rather than
+// O(n) though the setup and teardown is sufficiently expensive that it is
+// only worth using if we expect to be dealing with more than a few bits
+// The definition of "a few bits" will vary from platform to platform but
+// tests on ARM show that it probably isn't worth it for a single coded
+// residual, but is for >1 - it also seems likely that if there are
+// more residuals then they are likely to be bigger and this will make the
+// O(1) nature of the code more worthwhile.
+
|
|
+
|
|
+#if !USE_BY22_DIV
|
|
+// * 1/x @ 32 bits gets us 22 bits of accuracy
|
|
+#define CABAC_BY22_PEEK_BITS 22
|
|
+#else
|
|
+// A real 32-bit divide gets us another bit
|
|
+// If we have a 64 bit int & a unit time divider then we should get a lot
|
|
+// of bits (55) but that is untested and it is unclear if it would give
|
|
+// us a large advantage
|
|
+#define CABAC_BY22_PEEK_BITS 23
|
|
+#endif
|
|
+
|
|
+// Bypass block start
|
|
+// Must be called before _by22_peek is used as it sets the CABAC environment
|
|
+// into the correct state. _by22_finish must be called to return to 'normal'
|
|
+// (i.e. non-bypass) cabac decoding
|
|
+static inline void get_cabac_by22_start(CABACContext * const c)
|
|
+{
|
|
+ const unsigned int bits = __builtin_ctz(c->low);
|
|
+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
|
|
+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
|
|
+#if !USE_BY22_DIV
|
|
+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
|
|
+#endif
|
|
+
|
|
+ c->bytestream -= (CABAC_BITS / 8);
|
|
+ c->by22.bits = bits;
|
|
+#if !USE_BY22_DIV
|
|
+ c->by22.range = c->range;
|
|
+ c->range = inv;
|
|
+#endif
|
|
+ c->low = x;
|
|
+}
|
|
+
|
|
+// Bypass block finish
|
|
+// Must be called at the end of the bypass block to return to normal operation
|
|
+static inline void get_cabac_by22_finish(CABACContext * const c)
|
|
+{
|
|
+ unsigned int used = c->by22.bits;
|
|
+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
|
|
+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
|
|
+
|
|
+ c->bytestream += bytes_used + (CABAC_BITS / 8);
|
|
+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
|
|
+#if !USE_BY22_DIV
|
|
+ c->range = c->by22.range;
|
|
+#endif
|
|
+}
|
|
+
|
|
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is used, and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22, which should be long enough for any prefix or suffix,
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
|
|
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
|
|
+{
|
|
+#if USE_BY22_DIV
|
|
+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
|
|
+#else
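+    // During a bypass block c->range holds a scaled reciprocal of the
+    // true range (loaded from cabac_by22_inv_range in _by22_start), so a
+    // multiply-high replaces the divide in the USE_BY22_DIV branch above.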
+    uint32_t x = c->low & ~1U;
+    const uint32_t inv = c->range;
+
+    if (inv != 0)
+        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+    return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
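+    // One step of a long division: the n quotient bits consumed from the
+    // peeked value are multiplied back by the range and subtracted out,
+    // leaving the remainder shifted up ready for the next peek.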
+    // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+    // and refill lower bits
+    // We will probably OR over some existing bits but that doesn't matter
+    c->by22.bits += n;
+    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif // USE_BY22
+
+
 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
 {
     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
 }
 
-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
 {
-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
 }
 
-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
 {
-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
 }
 
-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
 {
-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
 }
 
 int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
 }
 
-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
 {
     int i = 0;
     int max = (log2_size << 1) - 1;
     int ctx_offset, ctx_shift;
 
-    if (!c_idx) {
+    if (!c_idx_nz) {
         ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
         ctx_shift = (log2_size + 1) >> 2;
     } else {
@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
     return value;
 }
 
-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
 {
     int inc;
 
-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+    inc = (ctx_cg != 0) + (c_idx_nz << 1);
 
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
 }
-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
-                                                          int offset, const uint8_t *ctx_idx_map)
-{
-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-}
 
-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
 {
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
 }
@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
 }
 
-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
 {
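+    // coeff_abs_level_remaining codes as a unary prefix of 1s, a 0, then
+    // a suffix: rice_param bits when prefix < 3, otherwise an escape with
+    // a (prefix - 3 + rice_param)-bit suffix. The first two branches fit
+    // in a single peek window; the third must flush and peek again.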
+    CABACContext * const c = &s->HEVClc->cc;
+    uint32_t y;
+    unsigned int prefix;
+    unsigned int last_coeff_abs_level_remaining;
+    unsigned int n;
+
+    y = get_cabac_by22_peek(c);
+    prefix = hevc_clz32(~y);
+    // y << prefix will always have top bit 0
+
+    if (prefix < 3) {
+        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+        n = prefix + 1 + rice_param;
+    }
+    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+    {
+        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+        n = prefix * 2 + rice_param - 2;
+    }
+    else {
+        unsigned int suffix;
+
+        get_cabac_by22_flush(c, prefix, y);
+        y = get_cabac_by22_peek(c);
+
+        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+        n = prefix + rice_param - 2;
+    }
+
+    get_cabac_by22_flush(c, n, y);
+
+    return last_coeff_abs_level_remaining;
+}
+#endif
+
+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+{
+    CABACContext * const c = &s->HEVClc->cc;
    int prefix = 0;
    int suffix = 0;
    int last_coeff_abs_level_remaining;
    int i;
 
-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
        prefix++;
    if (prefix == CABAC_MAX_BIN) {
        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
        return 0;
    }
+
    if (prefix < 3) {
        for (i = 0; i < rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+            suffix = (suffix << 1) | get_cabac_bypass(c);
        last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
    } else {
        int prefix_minus3 = prefix - 3;
        for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+            suffix = (suffix << 1) | get_cabac_bypass(c);
        last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
                                              << rc_rice_param) + suffix;
    }
+
    return last_coeff_abs_level_remaining;
 }
 
-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
 {
-    int i;
-    int ret = 0;
+    CABACContext * const c = &s->HEVClc->cc;
+    unsigned int i;
+    uint32_t ret = 0;
 
    for (i = 0; i < nb; i++)
-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
-    return ret;
+        ret = (ret << 1) | get_cabac_bypass(c);
+
+    return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+{
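+    // Signs are plain bypass bits: peek nb of them, flush, and return them
+    // left-aligned at bit 31 (matching the non-by22 variant above).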
+    CABACContext * const c = &s->HEVClc->cc;
+    uint32_t y;
+    y = get_cabac_by22_peek(c);
+    get_cabac_by22_flush(c, nb, y);
+    return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+                                                   uint8_t * const state0)
+{
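+    // Context selection: contexts 1..3 step up while only zeros have been
+    // seen, then drop to context 0 once a 1 bit has occurred (rv != 0).
+    // The greater1 flags come back packed into rv, last flag in the LSB.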
+    unsigned int i;
+    unsigned int rv = 0;
+    for (i = 0; i != n; ++i) {
+        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+        const unsigned int b = get_cabac(c, state0 + idx);
+        rv = (rv << 1) | b;
+    }
+    return rv;
+}
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
+// this version of events.
+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+                                         int * const pprev_subset_coded, int * const psum,
+                                         const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+    CABACContext * const c = &s->HEVClc->cc;
+    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+    unsigned int rv;
+    unsigned int i;
+    const unsigned int n = FFMIN(n_end, 8);
+
+    // Really this is i != n but the simple unconditional loop is cheaper
+    // and faster
+    for (i = 0; i != 8; ++i)
+        levels[i] = 1;
+
+    rv = get_cabac_greater1_bits(c, n, state0);
+
+    *pprev_subset_coded = 0;
+    *psum = n;
+
+    rv <<= (32 - n);
+    if (rv != 0)
+    {
+        *pprev_subset_coded = 1;
+        *psum = n + 1;
+        i = hevc_clz32(rv);
+        levels[i] = 2;
+        if (get_cabac(c, state_gt2) == 0)
+        {
+            // Unset first coded bit
+            rv &= ~(0x80000000U >> i);
+        }
+    }
+
+    if (n_end > 8) {
+        const unsigned int g8 = n_end - 8;
+        rv |= ((1 << g8) - 1) << (24 - g8);
+        for (i = 0; i != g8; ++i) {
+            levels[i + 8] = 0;
+        }
+    }
+
+    return rv;
+}
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+// or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+// to achieve it
+
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+}
+#endif
+
+
+#ifndef update_rice
+static inline void update_rice(uint8_t * const stat_coeff,
+                               const unsigned int last_coeff_abs_level_remaining,
+                               const unsigned int c_rice_param)
+{
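+    // Persistent rice adaptation: bump the stat when the coded value was
+    // large for the current rice parameter (x >= 6, i.e. remaining >= 3<<k)
+    // and decay it when the value was small (2*remaining < 1<<k), matching
+    // the logic in the loop this function replaces.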
+    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+    if (x >= 6)
+        (*stat_coeff)++;
+    else if (x == 0 && *stat_coeff > 0)
+        (*stat_coeff)--;
+}
+#endif
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+                                                      unsigned int n,
+                                                      const uint8_t * ctx_map,
+                                                      uint8_t * p)
+{
+    do {
+        if (get_cabac(c, state0 + ctx_map[n]))
+            *p++ = n;
+    } while (--n != 0);
+    return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+                                   unsigned int n,
+                                   const uint8_t * ctx_map,
+                                   uint8_t * const flag_idx)
+{
+    int rv;
+
+    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+    return rv;
+}
+
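+// The macros below take a 4x4 context-index map written in raster order
+// and store it permuted into horizontal, vertical or diagonal scan order,
+// so the sig-coeff loop can index the map directly by scan position.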
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+    x0,  x1,  x2,  x3,\
+    x4,  x5,  x6,  x7,\
+    x8,  x9,  x10, x11,\
+    x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+    x0, x4, x8,  x12,\
+    x1, x5, x9,  x13,\
+    x2, x6, x10, x14,\
+    x3, x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+    x0, x4, x1, x8,\
+    x5, x2, x12, x9,\
+    x6, x3, x13, x10,\
+    x7, x14, x11, x15}
+
+
+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+                              uint8_t * const significant_coeff_group_flag,
+                              const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+                              int * const pPrev_sig)
+{
+    while (--i >= 0) {
+        unsigned int x_cg = scan_x_cg[i];
+        unsigned int y_cg = scan_y_cg[i];
+
+        // For the flag decode we only care about Z/NZ but
+        // we use the full Right + Down * 2 when calculating
+        // significant coeff flags so we obtain it here.
+        //
+        // The group flag array is one longer than it needs to
+        // be so we don't need to check for y_cg limits
+        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
+            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
+
+        if (i == 0 ||
+            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+        {
+            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
+            *pPrev_sig = prev_sig;
+            break;
+        }
+    }
+
+    return i;
+}
+
 
 void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                                 int log2_trafo_size, enum ScanType scan_idx,
                                 int c_idx)
 {
-#define GET_COORD(offset, n)                                    \
-    do {                                                        \
-        x_c = (x_cg << 2) + scan_x_off[n];                      \
-        y_c = (y_cg << 2) + scan_y_off[n];                      \
-    } while (0)
-    HEVCLocalContext *lc = s->HEVClc;
-    int transform_skip_flag = 0;
+    HEVCLocalContext * const lc = s->HEVClc;
+    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
 
    int last_significant_coeff_x, last_significant_coeff_y;
-    int last_scan_pos;
-    int n_end;
    int num_coeff = 0;
-    int greater1_ctx = 1;
+    int prev_subset_coded = 0;
 
    int num_last_subset;
    int x_cg_last_sig, y_cg_last_sig;
 
-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+    const uint8_t *scan_x_cg, *scan_y_cg;
+    const xy_off_t * scan_xy_off;
 
    ptrdiff_t stride = s->frame->linesize[c_idx];
    int hshift = s->ps.sps->hshift[c_idx];
    int vshift = s->ps.sps->vshift[c_idx];
    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+#ifdef RPI
+    //***** transform_skip_flag decoded later!
+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+#endif
    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
    int explicit_rdpcm_flag = 0;
    int explicit_rdpcm_dir_flag;
 
    int trafo_size = 1 << log2_trafo_size;
    int i;
-    int qp,shift,add,scale,scale_m;
+    int qp,shift,scale;
    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
    const uint8_t *scale_matrix = NULL;
    uint8_t dc_scale;
    int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
                                         lc->tu.intra_pred_mode_c;
 
+    int prev_sig = 0;
+    const int c_idx_nz = (c_idx != 0);
+
+    int may_hide_sign;
+
+#ifdef RPI
+    if (s->enable_rpi) {
+        int n = trafo_size * trafo_size;
+        if (use_vpu) {
+            // We support size 4 and size 5.
+            // Size 4 grows from the front (coeffs_buf_arm[2] points to start of buf)
+            // Size 5 grows from the back (coeffs_buf_arm[3] points to end of buf)
+            // num_coeffs is indexed by log2_trafo_size-2
+            if (log2_trafo_size == 4)
+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+            else
+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+        } else {
+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+            s->num_coeffs[s->pass0_job][0] += n;
+        }
+    }
+    // We now do the memset after transform_add while we know the data is cached.
+    #ifdef RPI_PRECLEAR
+    #else
+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+    #endif
+#else
    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+#endif
+
+
 
    // Derive QP for dequant
    if (!lc->cu.cu_transquant_bypass_flag) {
-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
        static const uint8_t rem6[51 + 4 * 6 + 1] = {
            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
    };
    int qp_y = lc->qp_y;
 
+    may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
    if (s->ps.pps->transform_skip_enabled_flag &&
        log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
-        transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+        int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+        if (transform_skip_flag) {
+            trans_skip_or_bypass = 1;
+            if (lc->cu.pred_mode == MODE_INTRA &&
+                s->ps.sps->implicit_rdpcm_enabled_flag &&
+                (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+                may_hide_sign = 0;
+            }
+        }
    }
 
    if (c_idx == 0) {
@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
        qp += s->ps.sps->qp_bd_offset;
    }
 
-    shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-    add      = 1 << (shift-1);
-    scale    = level_scale[rem6[qp]] << (div6[qp]);
-    scale_m  = 16; // default when no custom scaling lists.
-    dc_scale = 16;
+    // Shift is set to one less than will actually occur as the scale
+    // and saturate step adds 1 and then shifts right again
+    shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+    scale = level_scale[rem6[qp]];
+    if (div6[qp] >= shift) {
+        scale <<= (div6[qp] - shift);
+        shift = 0;
+    } else {
+        shift -= div6[qp];
+    }
 
-    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+    if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
        const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+                                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
        int matrix_id = lc->cu.pred_mode != MODE_INTRA;
 
        matrix_id = 3 * matrix_id + c_idx;
 
        scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+        dc_scale = scale_matrix[0];
        if (log2_trafo_size >= 4)
            dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
    }
+    else
+    {
+        static const uint8_t sixteen_scale[64] = {
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16
+        };
+        scale_matrix = sixteen_scale;
+        dc_scale = 16;
+    }
    } else {
+        static const uint8_t unit_scale[64] = {
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+        };
+        scale_matrix = unit_scale;
        shift        = 0;
-        add          = 0;
-        scale        = 0;
-        dc_scale     = 0;
+        scale        = 2;  // We will shift right to kill this
+        dc_scale     = 1;
+
+        may_hide_sign = 0;
    }
 
    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+        trans_skip_or_bypass) {
+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
        if (explicit_rdpcm_flag) {
-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+            may_hide_sign = 0;
+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
        }
    }
 
-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
                                            &last_significant_coeff_x, &last_significant_coeff_y);
 
    if (last_significant_coeff_x > 3) {
@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
        int last_x_c = last_significant_coeff_x & 3;
        int last_y_c = last_significant_coeff_y & 3;
 
-        scan_x_off = ff_hevc_diag_scan4x4_x;
-        scan_y_off = ff_hevc_diag_scan4x4_y;
        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
-        if (trafo_size == 4) {
+
+        switch (log2_trafo_size) {
+        case 2:
            scan_x_cg = scan_1x1;
            scan_y_cg = scan_1x1;
-        } else if (trafo_size == 8) {
+            break;
+        case 3:
            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
            scan_x_cg = diag_scan2x2_x;
            scan_y_cg = diag_scan2x2_y;
-        } else if (trafo_size == 16) {
+            break;
+        case 4:
            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
            scan_x_cg = ff_hevc_diag_scan4x4_x;
            scan_y_cg = ff_hevc_diag_scan4x4_y;
-        } else { // trafo_size == 32
+            break;
+        case 5:
+        default:
            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
            scan_x_cg = ff_hevc_diag_scan8x8_x;
            scan_y_cg = ff_hevc_diag_scan8x8_y;
+            break;
        }
        break;
    }
    case SCAN_HORIZ:
        scan_x_cg = horiz_scan2x2_x;
        scan_y_cg = horiz_scan2x2_y;
-        scan_x_off = horiz_scan4x4_x;
-        scan_y_off = horiz_scan4x4_y;
        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
        break;
    default: //SCAN_VERT
        scan_x_cg = horiz_scan2x2_y;
        scan_y_cg = horiz_scan2x2_x;
-        scan_x_off = horiz_scan4x4_y;
-        scan_y_off = horiz_scan4x4_x;
        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
        break;
    }
    num_coeff++;
    num_last_subset = (num_coeff - 1) >> 4;
 
-    for (i = num_last_subset; i >= 0; i--) {
-        int n, m;
-        int x_cg, y_cg, x_c, y_c, pos;
-        int implicit_non_zero_coeff = 0;
-        int64_t trans_coeff_level;
-        int prev_sig = 0;
-        int offset = i << 4;
-        int rice_init = 0;
+    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig;  // 1st subset always significant
 
-        uint8_t significant_coeff_flag_idx[16];
-        uint8_t nb_significant_coeff_flag = 0;
-
-        x_cg = scan_x_cg[i];
-        y_cg = scan_y_cg[i];
-
-        if ((i < num_last_subset) && (i > 0)) {
-            int ctx_cg = 0;
-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
 
-            significant_coeff_group_flag[x_cg][y_cg] =
-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-            implicit_non_zero_coeff = 1;
-        } else {
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-                 (x_cg == 0 && y_cg == 0));
-        }
+    i = num_last_subset;
+    do {
+        int implicit_non_zero_coeff = 0;
+        int n_end;
 
-        last_scan_pos = num_coeff - offset - 1;
+        uint8_t significant_coeff_flag_idx[16];
+        unsigned int nb_significant_coeff_flag = 0;
 
        if (i == num_last_subset) {
+            // First time through
+            int last_scan_pos = num_coeff - (i << 4) - 1;
            n_end = last_scan_pos - 1;
            significant_coeff_flag_idx[0] = last_scan_pos;
            nb_significant_coeff_flag = 1;
        } else {
            n_end = 15;
+            implicit_non_zero_coeff = (i != 0);
        }
 
-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-
-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
-            static const uint8_t ctx_idx_map[] = {
-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+        if (n_end >= 0) {
+            static const uint8_t ctx_idx_maps_ts2[3][16] = {
+                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
+            };
+            static const uint8_t ctx_idx_maps[3][4][16] = {
+                {
+                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+                },
+                {
+                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+                },
+                {
+                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+                }
            };
            const uint8_t *ctx_idx_map_p;
            int scf_offset = 0;
-            if (s->ps.sps->transform_skip_context_enabled_flag &&
-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
-                if (c_idx == 0) {
-                    scf_offset = 40;
-                } else {
-                    scf_offset = 14 + 27;
-                }
+
+            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+                ctx_idx_map_p = ctx_idx_maps[0][3];
+                scf_offset = 40 + c_idx_nz;
            } else {
-                if (c_idx != 0)
+                if (c_idx_nz != 0)
                    scf_offset = 27;
+
                if (log2_trafo_size == 2) {
-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
                } else {
-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
-                    if (c_idx == 0) {
-                        if ((x_cg > 0 || y_cg > 0))
+                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+                    if (!c_idx_nz) {
+                        if (i != 0)
                            scf_offset += 3;
+
                        if (log2_trafo_size == 3) {
                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
                        } else {
@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                }
            }
        }
-        for (n = n_end; n > 0; n--) {
-            x_c = scan_x_off[n];
-            y_c = scan_y_off[n];
-            if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
-                significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                nb_significant_coeff_flag++;
+
+        if (n_end > 0) {
+            int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+                s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+                n_end, ctx_idx_map_p,
+                significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+            nb_significant_coeff_flag += cnt;
+            if (cnt != 0) {
                implicit_non_zero_coeff = 0;
            }
        }
+
        if (implicit_non_zero_coeff == 0) {
-            if (s->ps.sps->transform_skip_context_enabled_flag &&
-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-                if (c_idx == 0) {
-                    scf_offset = 42;
-                } else {
-                    scf_offset = 16 + 27;
-                }
+            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+                scf_offset = 42 + c_idx_nz;
            } else {
                if (i == 0) {
-                    if (c_idx == 0)
-                        scf_offset = 0;
-                    else
-                        scf_offset = 27;
+                    scf_offset = c_idx_nz ? 27 : 0;
                } else {
                    scf_offset = 2 + scf_offset;
                }
            }
-            if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+            if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
                nb_significant_coeff_flag++;
            }
@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
        }
    }
 
-    n_end = nb_significant_coeff_flag;
-
+        if (nb_significant_coeff_flag != 0) {
+            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+                ((i != 0 && !c_idx_nz) ? 2 : 0) |
+                prev_subset_coded;
+            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+                (gt1_idx_delta << 2);
+            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+                gt1_idx_delta;
+
+            const unsigned int x_cg = scan_x_cg[i];
+            const unsigned int y_cg = scan_y_cg[i];
+            int16_t * const blk_coeffs = coeffs +
+                ((x_cg + (y_cg << log2_trafo_size)) << 2);
+            // This calculation is 'wrong' for log2_trafo_size == 2
+            // but that doesn't matter as in this case x_cg & y_cg
+            // are always 0 so result is correct (0) anyway
+            const uint8_t * const blk_scale = scale_matrix +
+                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+            // * The following code block doesn't deal with these flags:
+            //   (nor did the one it replaces)
+            //
+            //   cabac_bypass_alignment_enabled_flag
+            //     This should be easy but I can't find a test case
+            //   extended_precision_processing_flag
+            //     This can extend the required precision past 16bits
+            //     so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+            if (nb_significant_coeff_flag == 1) {
+                // There is a small gain to be had from special casing the single
+                // transform coefficient case. The reduction in complexity
+                // makes up for the code duplication.
+
+                int trans_coeff_level = 1;
+                int coeff_sign_flag;
+                int coded_val = 0;
+
+                // initialize first elem of coeff_abs_level_greater1_flag
+                prev_subset_coded = 0;
+
+                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+                    trans_coeff_level = 2;
+                    prev_subset_coded = 1;
+                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+                }
 
-        if (n_end) {
-            int first_nz_pos_in_cg;
-            int last_nz_pos_in_cg;
-            int c_rice_param = 0;
-            int first_greater1_coeff_idx = -1;
-            uint8_t coeff_abs_level_greater1_flag[8];
-            uint16_t coeff_sign_flag;
-            int sum_abs = 0;
-            int sign_hidden;
-            int sb_type;
+                // Probably not worth the overhead of starting by22 for just one value
+                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
 
+                if (coded_val)
+                {
+                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+                    } else {
+                        uint8_t * const stat_coeff =
+                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+                        const unsigned int c_rice_param = *stat_coeff >> 2;
+                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
 
-            // initialize first elem of coeff_bas_level_greater1_flag
-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+                    }
+                }
 
-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
-                else
-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
-                c_rice_param = lc->stat_coeff[sb_type] / 4;
-            }
+                {
+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
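+                    // (level ^ k) - k negates the level when k == -1:
+                    // k is the sign bit broadcast across the whole word.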
+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+                    const unsigned int scale_m = blk_scale[xy_off->scale];
 
-            if (!(i == num_last_subset) && greater1_ctx == 0)
-                ctx_set++;
-            greater1_ctx = 1;
-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-                int inc = (ctx_set << 2) + greater1_ctx;
-                coeff_abs_level_greater1_flag[m] =
-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-                if (coeff_abs_level_greater1_flag[m]) {
-                    greater1_ctx = 0;
-                    if (first_greater1_coeff_idx == -1)
-                        first_greater1_coeff_idx = m;
-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-                    greater1_ctx++;
+                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
+                        (trans_coeff_level ^ k) - k,  // Apply sign
+                        scale,
+                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+                        shift);
                }
            }
-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-
-            if (lc->cu.cu_transquant_bypass_flag ||
-                (lc->cu.pred_mode == MODE_INTRA &&
-                 s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag &&
-                 (pred_mode_intra == 10 || pred_mode_intra == 26 )) ||
-                explicit_rdpcm_flag)
-                sign_hidden = 0;
            else
-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+#endif
+            {
+                int sign_hidden = may_hide_sign;
+                int levels[16]; // Should be able to get away with int16_t but that fails some tests
+                uint32_t coeff_sign_flags;
+                uint32_t coded_vals = 0;
+                // Sum(abs(level[]))
+                // In fact we only need the bottom bit and in some future
+                // version that may be all we calculate
+                unsigned int sum_abs;
+
+                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+
+                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+                    sign_hidden = 0;
+
+                // -- Start bypass block
+
+                bypass_start(s);
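+                // Everything decoded between bypass_start and bypass_finish
+                // (sign flags, abs_level_remaining) is bypass-coded, which
+                // is what allows the by22 batch reader to be used here.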
+
+                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+
+                if (coded_vals != 0)
+                {
+                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+                    int * level = levels - 1;
+
+                    do {
+                        {
+                            const unsigned int z = hevc_clz32(coded_vals) + 1;
+                            level += z;
+                            coded_vals <<= z;
+                        }
 
-            if (first_greater1_coeff_idx != -1) {
-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-            }
-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-            } else {
-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-            }
+                        {
+                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
+
+                            sum_abs += last_coeff_abs_level_remaining + 1;
+                            *level = trans_coeff_level;
+
+                            if (stat_coeff != NULL)
+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+                            stat_coeff = NULL;
 
-            for (m = 0; m < n_end; m++) {
-                n = significant_coeff_flag_idx[m];
-                GET_COORD(offset, n);
-                if (m < 8) {
-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
-                        trans_coeff_level += last_coeff_abs_level_remaining;
-                        if (trans_coeff_level > (3 << c_rice_param))
-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-                                lc->stat_coeff[sb_type]++;
-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-                                if (lc->stat_coeff[sb_type] > 0)
-                                    lc->stat_coeff[sb_type]--;
-                            rice_init = 1;
+                            if (trans_coeff_level > (3 << c_rice_param) &&
+                                (c_rice_param < 4 || rice_adaptation_enabled))
+                                ++c_rice_param;
                        }
-                    }
-                } else {
-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
-                    if (trans_coeff_level > (3 << c_rice_param))
-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-                            lc->stat_coeff[sb_type]++;
-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-                            if (lc->stat_coeff[sb_type] > 0)
-                                lc->stat_coeff[sb_type]--;
-                        rice_init = 1;
-                    }
+                    } while (coded_vals != 0);
                }
-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-                    sum_abs += trans_coeff_level;
-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
-                        trans_coeff_level = -trans_coeff_level;
+
+                // sign_hidden = 0 or 1 so we can combine the tests
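+                // Sign data hiding: when active, the sign of the last
+                // stored coefficient is implied by the parity of the sum
+                // of levels, so negate only when that parity bit is set.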
+                if ((sign_hidden & sum_abs) != 0) {
+                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
                }
-                if (coeff_sign_flag >> 15)
-                    trans_coeff_level = -trans_coeff_level;
-                coeff_sign_flag <<= 1;
-                if(!lc->cu.cu_transquant_bypass_flag) {
-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-                        if(y_c || x_c || log2_trafo_size < 4) {
-                            switch(log2_trafo_size) {
-                                case 3: pos = (y_c << 3) + x_c; break;
-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-                                default: pos = (y_c << 2) + x_c; break;
-                            }
-                            scale_m = scale_matrix[pos];
-                        } else {
-                            scale_m = dc_scale;
-                        }
+
+                bypass_finish(s);
+
+                // -- Finish bypass block
+
+                // Scale loop
+                {
+                    int m = nb_significant_coeff_flag - 1;
+
+                    // Deal with DC component (if any) first
+                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
+                    {
+                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+                        blk_coeffs[0] = trans_scale_sat(
+                            (levels[m] ^ k) - k, scale, dc_scale, shift);
+                        --m;
                    }
-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-                    if(trans_coeff_level < 0) {
-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-                            trans_coeff_level = -32768;
-                    } else {
-                        if(trans_coeff_level & 0xffffffffffff8000)
-                            trans_coeff_level = 32767;
+
+#if !USE_N_END_1
+                    // If N_END_1 set then m was at least 1 initially
+                    if (m >= 0)
+#endif
+                    {
+                        do {
+                            const xy_off_t * const xy_off = scan_xy_off +
+                                significant_coeff_flag_idx[m];
+                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+
+                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
+                                (levels[m] ^ k) - k,
+                                scale,
+                                blk_scale[xy_off->scale],
+                                shift);
+                        } while (--m >= 0);
                    }
                }
-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+
            }
        }
-    }
+    } while ((i = next_subset(s, i, c_idx_nz,
+        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
 
    if (lc->cu.cu_transquant_bypass_flag) {
        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
        }
    } else {
-        if (transform_skip_flag) {
+        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
                      log2_trafo_size == 2 &&
                      lc->cu.pred_mode == MODE_INTRA;
@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                for (i = 0; i < 8; i++)
                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
            }
-
            s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
 
            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
            }
        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-            s->hevcdsp.idct_4x4_luma(coeffs);
+            s->hevcdsp.idct_4x4_luma(coeffs);
        } else {
+#ifdef RPI
+            if (!use_vpu) {
+                int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+                if (max_xy == 0) {
+                    s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+                } else {
+                    int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+                    if (max_xy < 4)
+                        col_limit = FFMIN(4, col_limit);
+                    else if (max_xy < 8)
+                        col_limit = FFMIN(8, col_limit);
+                    else if (max_xy < 12)
+                        col_limit = FFMIN(24, col_limit);
+
+                    s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+                }
+            }
+#else
            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
            if (max_xy == 0)
                s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                    col_limit = FFMIN(24, col_limit);
                s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
            }
+#endif
        }
    }
    if (lc->tu.cross_pf) {
@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
        }
    }
+#ifdef RPI
+    if (s->enable_rpi) {
+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+        cmd->type = RPI_PRED_TRANSFORM_ADD;
+        cmd->size = log2_trafo_size;
+        cmd->buf = coeffs;
+        cmd->dst = dst;
+        cmd->stride = stride;
+        return;
+    }
+#endif
    s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
 }
 
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 1f33b0c..55a0315 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -22,6 +22,12 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
+//#define DISABLE_SAO
+//#define DISABLE_DEBLOCK
+//#define DISABLE_STRENGTHS
+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+//#define DISABLE_DEBLOCK_NONREF
+
 #include "libavutil/common.h"
 #include "libavutil/internal.h"
 
@@ -31,6 +37,11 @@
 
 #include "bit_depth_template.c"
 
+#ifdef RPI
+#include "rpi_user_vcsm.h"
+#include "rpi_qpu.h"
+#endif
+
 #define LUMA 0
 #define CB 1
 #define CR 2
@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
    edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
    edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
 
+#ifdef DISABLE_SAO
+    return;
+#endif
+
    if (restore) {
        if (!edges[0]) {
            left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
          s->ps.sps->pcm.loop_filter_disable_flag) ||
        s->ps.pps->transquant_bypass_enable_flag;
 
+#ifdef DISABLE_DEBLOCK_NONREF
+    if (!s->used_for_ref)
+        return; // Don't deblock non-reference frames
+#endif
+#ifdef DISABLE_DEBLOCK
+    return;
+#endif
+    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+        return;
    if (x0) {
        left_tc_offset   = s->deblock[ctb - 1].tc_offset;
        left_beta_offset = s->deblock[ctb - 1].beta_offset;
@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                   s->frame->linesize[LUMA],
                                                   beta, tc, no_p, no_q);
            } else
+#ifdef RPI_DEBLOCK_VPU
+            if (s->enable_rpi_deblock) {
+                uint8_t (*setup)[2][2][4];
+                int num16 = (y>>4)*s->setup_width + (x>>4);
+                int a = ((y>>3) & 1) << 1;
+                int b = (x>>3) & 1;
+                setup = s->dvq->y_setup_arm[num16];
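+                // Pack beta/tc for this edge into the per-16x16 setup block
+                // consumed by the VPU deblocker; layout appears to be
+                // [dir][8x8 half][beta|tc][pos] (inferred from the stores).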
+                setup[0][b][0][a] = beta;
+                setup[0][b][0][a + 1] = beta;
+                setup[0][b][1][a] = tc[0];
+                setup[0][b][1][a + 1] = tc[1];
+            } else
+#endif
                s->hevcdsp.hevc_v_loop_filter_luma(src,
                                                   s->frame->linesize[LUMA],
                                                   beta, tc, no_p, no_q);
@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                   s->frame->linesize[LUMA],
                                                   beta, tc, no_p, no_q);
            } else
+#ifdef RPI_DEBLOCK_VPU
+            if (s->enable_rpi_deblock) {
+                uint8_t (*setup)[2][2][4];
+                int num16 = (y>>4)*s->setup_width + (x>>4);
+                int a = ((x>>3) & 1) << 1;
+                int b = (y>>3) & 1;
+                setup = s->dvq->y_setup_arm[num16];
+                setup[1][b][0][a] = beta;
+                setup[1][b][0][a + 1] = beta;
+                setup[1][b][1][a] = tc[0];
+                setup[1][b][1][a + 1] = tc[1];
+            } else
+#endif
                s->hevcdsp.hevc_h_loop_filter_luma(src,
                                                   s->frame->linesize[LUMA],
                                                   beta, tc, no_p, no_q);
@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                         s->frame->linesize[chroma],
                                                         c_tc, no_p, no_q);
                } else
+#ifdef RPI_DEBLOCK_VPU
+                if (s->enable_rpi_deblock) {
+                    uint8_t (*setup)[2][2][4];
+                    int xc = x>>s->ps.sps->hshift[chroma];
+                    int yc = y>>s->ps.sps->vshift[chroma];
+                    int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+                    int a = ((yc>>3) & 1) << 1;
+                    int b = (xc>>3) & 1;
+                    setup = s->dvq->uv_setup_arm[num16];
+                    setup[0][b][0][a] = c_tc[0];
+                    setup[0][b][0][a + 1] = c_tc[1];
+                } else
+#endif
                    s->hevcdsp.hevc_v_loop_filter_chroma(src,
                                                         s->frame->linesize[chroma],
                                                         c_tc, no_p, no_q);
+
            }
        }
 
@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                         s->frame->linesize[chroma],
                                                         c_tc, no_p, no_q);
                } else
+#ifdef RPI_DEBLOCK_VPU
+                if (s->enable_rpi_deblock) {
+                    uint8_t (*setup)[2][2][4];
+                    int xc = x>>s->ps.sps->hshift[chroma];
+                    int yc = y>>s->ps.sps->vshift[chroma];
+                    int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+                    int a = ((xc>>3) & 1) << 1;
+                    int b = (yc>>3) & 1;
+                    setup = s->dvq->uv_setup_arm[num16];
+                    setup[1][b][0][a] = c_tc[0];
+                    setup[1][b][0][a + 1] = c_tc[1];
+                } else
+#endif
                    s->hevcdsp.hevc_h_loop_filter_chroma(src,
                                                         s->frame->linesize[chroma],
                                                         c_tc, no_p, no_q);
@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
    }
 }
 
-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
-                             RefPicList *neigh_refPicList)
-{
-    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-        // same L0 and L1
-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
-                return 1;
-            else
-                return 0;
-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
-                return 1;
-            else
-                return 0;
-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
-                return 1;
-            else
-                return 0;
-        } else {
-            return 1;
-        }
-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-        Mv A, B;
-        int ref_A, ref_B;
-
-        if (curr->pred_flag & 1) {
-            A     = curr->mv[0];
-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
-        } else {
-            A     = curr->mv[1];
-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
-        }
-
-        if (neigh->pred_flag & 1) {
-            B     = neigh->mv[0];
-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
-        } else {
-            B     = neigh->mv[1];
-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
-        }
-
-        if (ref_A == ref_B) {
-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
-                return 1;
-            else
-                return 0;
-        } else
-            return 1;
-    }
-
-    return 1;
-}
 
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                           int log2_trafo_size)
@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
    int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
    int min_pu_width     = s->ps.sps->min_pu_width;
    int min_tu_width     = s->ps.sps->min_tb_width;
-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
    int boundary_upper, boundary_left;
-    int i, j, bs;
+    int i, j;
+    RefPicList *rpl = s->ref->refPicList;
+    int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2;
+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+    int y_pu = y0 >> log2_min_pu_size;
+    int x_pu = x0 >> log2_min_pu_size;
+    MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+    int is_intra = curr->pred_flag == PF_INTRA;
+    int inc = log2_min_pu_size == 2 ? 2 : 1;
+    uint8_t *bs;
+
+#ifdef DISABLE_STRENGTHS
+    return;
+#endif
 
    boundary_upper = y0 > 0 && !(y0 & 7);
    if (boundary_upper &&
@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
          (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
        boundary_upper = 0;
 
+    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+
    if (boundary_upper) {
        RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
                              ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
-                              s->ref->refPicList;
-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
-        int yq_pu =  y0      >> log2_min_pu_size;
-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
-        int yq_tu =  y0      >> log2_min_tu_size;
+                              rpl;
+        MvField *top = curr - min_pu_width;
+
+        if (is_intra) {
+            for (i = 0; i < (1 << log2_trafo_size); i += 4)
+                bs[i >> 2] = 2;
+
+        } else {
+            int y_tu = y0 >> log2_min_tu_size;
+            int x_tu = x0 >> log2_min_tu_size;
+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+
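+            // hevc_deblocking_boundary_strengths is the new DSP hook: it
+            // derives bs for a run of 4-pel units from the curr/top (or
+            // curr/left) MvFields and ref lists in one call, replacing the
+            // scalar boundary_strength() helper removed earlier in this
+            // patch; intra and cbf_luma cases are patched up afterwards.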
+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+                min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+                rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+                curr, top, bs);
 
        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int x_pu = (x0 + i) >> log2_min_pu_size;
-            int x_tu = (x0 + i) >> log2_min_tu_size;
-            MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-            MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-            uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
-            if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
-                bs = 2;
-            else if (curr_cbf_luma || top_cbf_luma)
-                bs = 1;
-            else
-                bs = boundary_strength(s, curr, top, rpl_top);
-            s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+                int i_pu = i >> log2_min_pu_size;
+                int i_tu = i >> log2_min_tu_size;
+
+                if (top[i_pu].pred_flag == PF_INTRA)
+                    bs[i >> 2] = 2;
+                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+                    bs[i >> 2] = 1;
            }
+        }
+    }
+
+    if (!is_intra) {
+        for (j = inc; j < trafo_in_min_pus; j += inc) {
+            MvField *top;
+
+            curr += min_pu_width * inc;
+            top = curr - min_pu_width;
+            bs += s->bs_width * inc << log2_min_pu_size >> 2;
+
+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+                min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+                rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+                curr, top, bs);
+        }
    }
 
-    // bs for vertical TU boundaries
    boundary_left = x0 > 0 && !(x0 & 7);
    if (boundary_left &&
        ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
          (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
        boundary_left = 0;
 
+    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+
    if (boundary_left) {
        RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
                               ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
-                               s->ref->refPicList;
-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
-        int xq_pu =  x0      >> log2_min_pu_size;
-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
-        int xq_tu =  x0      >> log2_min_tu_size;
+                               rpl;
+        MvField *left = curr - 1;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int y_pu = (y0 + i) >> log2_min_pu_size;
-            int y_tu = (y0 + i) >> log2_min_tu_size;
-            MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-            MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-            uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-
-            if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
-                bs = 2;
-            else if (curr_cbf_luma || left_cbf_luma)
-                bs = 1;
-            else
-                bs = boundary_strength(s, curr, left, rpl_left);
-            s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
-        }
-    }
+        if (is_intra) {
+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+                bs[j * s->bs_width >> 2] = 2;
 
-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
-        RefPicList *rpl = s->ref->refPicList;
-
-        // bs for TU internal horizontal PU boundaries
-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
-
-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-                int x_pu = (x0 + i) >> log2_min_pu_size;
-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-
-                bs = boundary_strength(s, curr, top, rpl);
-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+        } else {
+            int y_tu = y0 >> log2_min_tu_size;
+            int x_tu = x0 >> log2_min_tu_size;
+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+
+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+                min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+                rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+                curr, left, bs);
+
+            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+                int j_pu = j >> log2_min_pu_size;
+                int j_tu = j >> log2_min_tu_size;
+
+                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+                    bs[j * s->bs_width >> 2] = 2;
+                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+                    bs[j * s->bs_width >> 2] = 1;
            }
        }
+    }
 
-        // bs for TU internal vertical PU boundaries
-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
-            int y_pu = (y0 + j) >> log2_min_pu_size;
+    if (!is_intra) {
+        for (i = inc; i < trafo_in_min_pus; i += inc) {
+            MvField *left;
 
-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+            curr += inc;
+            left = curr - 1;
+            bs += inc << log2_min_pu_size >> 2;
 
-                bs = boundary_strength(s, curr, left, rpl);
-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
-            }
+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+                min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+                rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+                curr, left, bs);
        }
    }
 }
@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
|
|
#undef CB
|
|
#undef CR

+#if !defined(RPI_FAST_CACHEFLUSH)
+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+static void flush_buffer_y(const AVFrame * const frame) {
+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+    gpu_cache_flush(&p);
+}
+
+static void flush_buffer_u(const AVFrame * const frame) {
+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+    gpu_cache_flush(&p);
+}
+
+static void flush_buffer_v(const AVFrame * const frame) {
+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+    gpu_cache_flush(&p);
+}
+#endif
+#endif
+
+
+#ifdef RPI_DEBLOCK_VPU
+#error Not fixed yet
+
+// ff_hevc_flush_buffer_lines
+// flushes and invalidates all pixel rows in [start,end-1]
+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+    struct vcsm_user_clean_invalid_s iocache = {};
+    int curr_y = start;
+    int n = end;
+    int curr_uv = curr_y >> s->ps.sps->vshift[1];
+    int n_uv = n >> s->ps.sps->vshift[1];
+    int sz,base;
+    GPU_MEM_PTR_T p;
+    if (curr_uv < 0) curr_uv = 0;
+    if (n_uv<=curr_uv) { return; }
+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+    base = s->frame->linesize[1] * curr_uv;
+    if (flush_chroma) {
+        p = get_gpu_mem_ptr_u(s->frame);
+        iocache.s[0].handle = p.vcsm_handle;
+        iocache.s[0].cmd = 3; // clean+invalidate
+        iocache.s[0].addr = (int)p.arm + base;
+        iocache.s[0].size = sz;
+        p = get_gpu_mem_ptr_v(s->frame);
+        iocache.s[1].handle = p.vcsm_handle;
+        iocache.s[1].cmd = 3; // clean+invalidate
+        iocache.s[1].addr = (int)p.arm + base;
+        iocache.s[1].size = sz;
+    }
+    if (flush_luma) {
+        p = get_gpu_mem_ptr_y(s->frame);
+        sz = s->frame->linesize[0] * (n-curr_y);
+        base = s->frame->linesize[0] * curr_y;
+        iocache.s[2].handle = p.vcsm_handle;
+        iocache.s[2].cmd = 3; // clean+invalidate
+        iocache.s[2].addr = (int)p.arm + base;
+        iocache.s[2].size = sz;
+    }
+    vcsm_clean_invalid( &iocache );
+#else
+    if (flush_chroma) {
+        flush_buffer_u(s->frame);
+        flush_buffer_v(s->frame);
+    }
+    if (flush_luma) {
+        flush_buffer_y(s->frame);
+    }
+#endif
+}
+#endif
+
+#ifdef RPI_INTER_QPU
+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+{
+    if (s->enable_rpi && s->used_for_ref) {
+        // TODO make this use ff_hevc_flush_buffer_lines
+#ifdef RPI_FAST_CACHEFLUSH
+        struct vcsm_user_clean_invalid_s iocache = {};
+        int curr_y = ((int *)f->progress->data)[0];
+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+        int n_uv = n >> s->ps.sps->vshift[1];
+        int sz,base;
+        GPU_MEM_PTR_T p;
+        if (curr_uv < 0) curr_uv = 0;
+        if (n_uv<=curr_uv) { return; }
+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+        base = s->frame->linesize[1] * curr_uv;
+        p = get_gpu_mem_ptr_u(s->frame);
+        iocache.s[0].handle = p.vcsm_handle;
+        iocache.s[0].cmd = 3; // clean+invalidate
+        iocache.s[0].addr = (int)p.arm + base;
+        iocache.s[0].size = sz;
+        p = get_gpu_mem_ptr_v(s->frame);
+        iocache.s[1].handle = p.vcsm_handle;
+        iocache.s[1].cmd = 3; // clean+invalidate
+        iocache.s[1].addr = (int)p.arm + base;
+        iocache.s[1].size = sz;
+
+#ifdef RPI_LUMA_QPU
+        p = get_gpu_mem_ptr_y(s->frame);
+        sz = s->frame->linesize[0] * (n-curr_y);
+        base = s->frame->linesize[0] * curr_y;
+        iocache.s[2].handle = p.vcsm_handle;
+        iocache.s[2].cmd = 3; // clean+invalidate
+        iocache.s[2].addr = (int)p.arm + base;
+        iocache.s[2].size = sz;
+#endif
+        vcsm_clean_invalid( &iocache );
+#else
+        flush_buffer_u(s->frame);
+        flush_buffer_v(s->frame);
+#ifdef RPI_LUMA_QPU
+        flush_buffer_y(s->frame);
+#endif
+
+#endif
+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+    }
+}
+#endif
+
+#ifdef RPI_DEBLOCK_VPU
+#error XXX
+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+{
+    // Flush image, 4 lines above to bottom of ctb stripe
+    ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+    // TODO flush buffer of beta/tc setup when it becomes cached
+
+    // Prepare three commands at once to avoid calling overhead
+    s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+    s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+    s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+    s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+    s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+    s->dvq->vpu_cmds_arm[0][5] = 2;
+
+    s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+    s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+    s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+    s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+    s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+    s->dvq->vpu_cmds_arm[1][5] = 3;
+
+    s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+    s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+    s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+    s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+    s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+    s->dvq->vpu_cmds_arm[2][5] = 4;
+    // Call VPU
+    s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+
+    s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+    s->dvq = s->dvq_ents + s->dvq_n;
+
+    if (s->dvq->cmd_id != -1) {
+        vpu_wait(s->dvq->cmd_id);
+        s->dvq->cmd_id = -1;
+    }
+}
+
+#endif
+
void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
{
    int x_end = x >= s->ps.sps->width - ctb_size;
+#ifdef RPI_DEBLOCK_VPU
+    int done_deblock = 0;
+#endif
    if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
        deblocking_filter_CTB(s, x, y);
+#ifdef RPI_DEBLOCK_VPU
+    if (s->enable_rpi_deblock && x_end)
+    {
+        int y_at_end = y >= s->ps.sps->height - ctb_size;
+        int height = 64; // Deblock in units 64 high to avoid too many VPU calls
+        int y_start = y&~63;
+        if (y_at_end) height = s->ps.sps->height - y_start;
+        if ((((y+ctb_size)&63)==0) || y_at_end) {
+            done_deblock = 1;
+            rpi_deblock(s, y_start, height);
+        }
+    }
+#endif
    if (s->ps.sps->sao_enabled) {
        int y_end = y >= s->ps.sps->height - ctb_size;
        if (y && x)
@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
            sao_filter_CTB(s, x - ctb_size, y);
        if (y && x_end) {
            sao_filter_CTB(s, x, y - ctb_size);
-            if (s->threads_type & FF_THREAD_FRAME )
+            if (s->threads_type & FF_THREAD_FRAME ) {
+#ifdef RPI_INTER_QPU
+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+#endif
                ff_thread_report_progress(&s->ref->tf, y, 0);
+            }
        }
        if (x_end && y_end) {
            sao_filter_CTB(s, x , y);
-            if (s->threads_type & FF_THREAD_FRAME )
+            if (s->threads_type & FF_THREAD_FRAME ) {
+#ifdef RPI_INTER_QPU
+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+#endif
                ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+            }
+        }
+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+        //int newh = y + ctb_size - 4;
+        //int currh = s->ref->tf.progress->data[0];
+        //if (((y + ctb_size)&63)==0)
+#ifdef RPI_DEBLOCK_VPU
+        if (s->enable_rpi_deblock) {
+            // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+            if (done_deblock) {
+                ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+            }
+        } else {
+#ifdef RPI_INTER_QPU
+            ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+#endif
+            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
        }
-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+#else
+#ifdef RPI_INTER_QPU
+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+#endif
        ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+#endif
+    }
}

void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
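The stripe batching in ff_hevc_hls_filter() above is easier to follow with concrete numbers. Below is a minimal stand-alone C sketch (not part of the patch; the frame height, CTB size and 4:2:0 vshift are assumed example values) that reproduces the 64-line trigger condition and the luma-to-chroma row mapping used when a stripe is flushed for the VPU:

#include <stdio.h>

/* Toy model of two pieces of arithmetic used above: the 64-line
 * stripe trigger in ff_hevc_hls_filter() and the flush window used by
 * rpi_deblock(), which flushes from 4 rows above the stripe down to
 * its bottom, with chroma rows obtained by >> vshift. */
int main(void)
{
    int ctb_size = 32, height = 1080, vshift = 1;

    for (int y = 0; y < height; y += ctb_size) {
        int y_at_end = y >= height - ctb_size;
        if ((((y + ctb_size) & 63) == 0) || y_at_end) {
            int y_start = y & ~63;
            int h = y_at_end ? height - y_start : 64;
            int flush_top = y_start - 4 < 0 ? 0 : y_start - 4;
            int flush_bot = y_start + h;
            printf("stripe y=%4d h=%2d flush luma [%d,%d) chroma [%d,%d)\n",
                   y_start, h, flush_top, flush_bot,
                   flush_top >> vshift, flush_bot >> vshift);
        }
    }
    return 0;
}

With ctb_size = 32 this fires on every second CTB row, so the VPU sees half as many deblock calls as there are CTB rows.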
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 83f2ec2..6882a8d 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
    sps->amp_enabled_flag = get_bits1(gb);
    sps->sao_enabled = get_bits1(gb);

+    av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled);
+
    sps->pcm_enabled_flag = get_bits1(gb);
    if (sps->pcm_enabled_flag) {
        sps->pcm.bit_depth = get_bits(gb, 4) + 1;
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..a6534a9 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
#include "hevcdsp_template.c"
#undef BIT_DEPTH

+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+                                               MvField *curr, MvField *neigh, uint8_t *bs)
+{
+    for (; pus > 0; pus--) {
+        int strength, out;
+        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+
+#if 1 // This more directly matches the original implementation
+        if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+            // same L0 and L1
+            if (curr_refL0 == neigh_refL0 &&
+                curr_refL0 == curr_refL1 &&
+                neigh_refL0 == neigh_refL1) {
+                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+                    strength = 1;
+                else
+                    strength = 0;
+            } else if (neigh_refL0 == curr_refL0 &&
+                       neigh_refL1 == curr_refL1) {
+                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+                    strength = 1;
+                else
+                    strength = 0;
+            } else if (neigh_refL1 == curr_refL0 &&
+                       neigh_refL0 == curr_refL1) {
+                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+                    strength = 1;
+                else
+                    strength = 0;
+            } else {
+                strength = 1;
+            }
+        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+            Mv curr_mv0, neigh_mv0;
+
+            if (curr->pred_flag & 1) {
+                curr_mv0 = curr->mv[0];
+            } else {
+                curr_mv0 = curr->mv[1];
+                curr_refL0 = curr_refL1;
+            }
+
+            if (neigh->pred_flag & 1) {
+                neigh_mv0 = neigh->mv[0];
+            } else {
+                neigh_mv0 = neigh->mv[1];
+                neigh_refL0 = neigh_refL1;
+            }
+
+            if (curr_refL0 == neigh_refL0) {
+                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+                    strength = 1;
+                else
+                    strength = 0;
+            } else
+                strength = 1;
+        } else
+            strength = 1;
+#else // This has exactly the same effect, but is more suitable for vectorisation
+        Mv curr_mv[2];
+        Mv neigh_mv[2];
+        memcpy(curr_mv, curr->mv, sizeof curr_mv);
+        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+
+        if (!(curr->pred_flag & 2)) {
+            curr_mv[1] = curr_mv[0];
+            curr_refL1 = curr_refL0;
+        }
+        if (!(neigh->pred_flag & 2)) {
+            neigh_mv[1] = neigh_mv[0];
+            neigh_refL1 = neigh_refL0;
+        }
+        if (!(curr->pred_flag & 1)) {
+            curr_mv[0] = curr_mv[1];
+            curr_refL0 = curr_refL1;
+        }
+        if (!(neigh->pred_flag & 1)) {
+            neigh_mv[0] = neigh_mv[1];
+            neigh_refL0 = neigh_refL1;
+        }
+
+        strength = 1;
+
+        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+                    (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+                    (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+
+        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+                    (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+                    (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+
+        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+#endif
+
+        curr += in_inc / sizeof (MvField);
+        neigh += in_inc / sizeof (MvField);
+
+        for (out = dup; out > 0; out--)
+        {
+            *bs = strength;
+            bs += out_inc;
+        }
+    }
+}
+
void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
{
#undef FUNC
@@ -257,6 +371,8 @@ int i = 0;
        break;
    }

+    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+
    if (ARCH_X86)
        ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
    if (ARCH_ARM)
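One detail of hevc_deblocking_boundary_strengths() above that is easy to miss is the closing line of the vectorisable branch, strength |= ((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2. A small self-contained check (assuming FFmpeg's PredFlag values PF_L0 = 1, PF_L1 = 2, PF_BI = 3; intra blocks take the bs = 2 path before this routine is called) shows it forces strength to 1 exactly when one side is bi-predicted and the other is not:

#include <stdio.h>

/* Demonstration of the pred_flag trick in the vectorisable branch
 * above: ((pf_a + 1) ^ (pf_b + 1)) >> 2 is non-zero exactly when one
 * of pf_a, pf_b is 3 (PF_BI) and the other is not, since only 3 + 1
 * has bit 2 set. */
int main(void)
{
    for (int a = 1; a <= 3; a++)
        for (int b = 1; b <= 3; b++)
            printf("pred_flags %d,%d -> force bs 1: %d\n",
                   a, b, (((a + 1) ^ (b + 1)) >> 2) != 0);
    return 0;
}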
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..e221e54 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -42,6 +42,17 @@ typedef struct SAOParams {
    uint8_t type_idx[3];    ///< sao_type_idx
} SAOParams;

+typedef struct Mv {
+    int16_t x;  ///< horizontal component of motion vector
+    int16_t y;  ///< vertical component of motion vector
+} Mv;
+
+typedef struct MvField {
+    DECLARE_ALIGNED(4, Mv, mv)[2];
+    int8_t ref_idx[2];
+    int8_t pred_flag;
+} MvField;
+
typedef struct HEVCDSPContext {
    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
                    struct GetBitContext *gb, int pcm_bit_depth);
@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
    void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q);
+    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+                                               MvField *curr, MvField *neigh, uint8_t *bs);
} HEVCDSPContext;

void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
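The pred_flag field of the MvField introduced above is consumed as a bit mask by the DSP routine (bit 0 = list 0 used, bit 1 = list 1 used, matching FFmpeg's PF_L0/PF_L1/PF_BI values). A hedged helper showing the intended access pattern, for orientation only and assuming the MvField definition just added:

/* Sketch only: returns non-zero if the given reference list (0 or 1)
 * contributes to this PU's prediction. */
static inline int mvfield_uses_list(const MvField *mf, int list)
{
    return (mf->pred_flag >> list) & 1;
}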
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 6ae87cc..28d2653 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -20,6 +20,8 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

+//#define DISABLE_INTRA
+
#include "libavutil/pixdesc.h"

#include "bit_depth_template.c"
@@ -69,8 +71,11 @@ do { \
            AV_WN4P(&ptr[i], a); \
        else \
            a = PIXEL_SPLAT_X4(ptr[i + 3])
-
+#ifdef RPI_WORKER
+    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+#else
    HEVCLocalContext *lc = s->HEVClc;
+#endif
    int i;
    int hshift = s->ps.sps->hshift[c_idx];
    int vshift = s->ps.sps->vshift[c_idx];
@@ -114,6 +119,10 @@ do { \
    int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
                          (x0 + size_in_luma_h)) >> hshift;

+#ifdef DISABLE_INTRA
+    return;
+#endif
+
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        int size_in_luma_pu_v = PU(size_in_luma_v);
        int size_in_luma_pu_h = PU(size_in_luma_h);
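The RPI_WORKER hunk above casts a HEVCLocalContext pointer to HEVCLocalContextIntra, which presumably relies on the intra-only context being the leading sub-object of the full context. A generic sketch of that struct-prefix pattern (types invented for illustration, not the patch's actual layout):

/* If FullCtx begins with a SmallCtx, a pointer to the full context
 * can safely be viewed as a pointer to the smaller leading part. */
typedef struct SmallCtx { int a; } SmallCtx;
typedef struct FullCtx  { SmallCtx base; int extra; } FullCtx;

static SmallCtx *as_small(FullCtx *f) { return &f->base; }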
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index 099a8c5..bdff2d2 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -24,6 +24,9 @@
 * MMAL Video Decoder
 */

+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
#include <bcm_host.h>
#include <interface/mmal/mmal.h>
#include <interface/mmal/mmal_parameters_video.h>
@@ -31,6 +34,7 @@
#include <interface/mmal/util/mmal_util_params.h>
#include <interface/mmal/util/mmal_default_components.h>
#include <interface/mmal/vc/mmal_vc_api.h>
+#pragma GCC diagnostic pop

#include "avcodec.h"
#include "internal.h"
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index 3adf28d..2f9195f 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)

    if (ctx->divx_version >= 0)
        s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+
+    if (ctx->num_sprite_warping_points > 1)
+        s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
    }

    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");

+    avctx->workaround_bugs = s->workaround_bugs;
    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
        s->codec_id == AV_CODEC_ID_MPEG4 &&
        avctx->idct_algo == FF_IDCT_AUTO) {
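Exporting workaround_bugs back into the AVCodecContext above gives callers a way to react to the new GMC flag. A hedged sketch of such a caller (FF_BUG_GMC_UNSUPPORTED is defined elsewhere in this patch set; the helper name is invented):

#include "avcodec.h"

/* Sketch only: an MMAL-based caller could test the flag set by the
 * hunk above and fall back to software decode for MPEG-4 streams
 * using more than one sprite warping point (GMC). */
static int hw_decode_possible(const AVCodecContext *avctx)
{
    return !(avctx->workaround_bugs & FF_BUG_GMC_UNSUPPORTED);
}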
diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
new file mode 100644
index 0000000..4309f1c
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.h
@@ -0,0 +1,3070 @@
+unsigned char rpi_hevc_transform [] = {
|
|
+21,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+47,
|
|
+1,
|
|
+37,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+66,
|
|
+1,
|
|
+53,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+192,
|
|
+4,
|
|
+69,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+192,
|
|
+4,
|
|
+85,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+220,
|
|
+5,
|
|
+169,
|
|
+3,
|
|
+62,
|
|
+64,
|
|
+79,
|
|
+64,
|
|
+3,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+248,
|
|
+0,
|
|
+136,
|
|
+0,
|
|
+0,
|
|
+192,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+248,
|
|
+0,
|
|
+168,
|
|
+0,
|
|
+0,
|
|
+192,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+96,
|
|
+3,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+7,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+232,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+248,
|
|
+0,
|
|
+128,
|
|
+0,
|
|
+0,
|
|
+192,
|
|
+8,
|
|
+4,
|
|
+0,
|
|
+4,
|
|
+232,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+5,
|
|
+232,
|
|
+0,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+69,
|
|
+113,
|
|
+66,
|
|
+12,
|
|
+248,
|
|
+0,
|
|
+128,
|
|
+0,
|
|
+0,
|
|
+192,
|
|
+8,
|
|
+4,
|
|
+0,
|
|
+128,
|
|
+69,
|
|
+113,
|
|
+70,
|
|
+128,
|
|
+144,
|
|
+40,
|
|
+0,
|
|
+4,
|
|
+255,
|
|
+48,
|
|
+192,
|
|
+128,
|
|
+3,
|
|
+32,
|
|
+8,
|
|
+16,
|
|
+0,
|
|
+76,
|
|
+254,
|
|
+48,
|
|
+192,
|
|
+9,
|
|
+4,
|
|
+32,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+254,
|
|
+0,
|
|
+144,
|
|
+128,
|
|
+2,
|
|
+0,
|
|
+8,
|
|
+2,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+23,
|
|
+0,
|
|
+4,
|
|
+255,
|
|
+48,
|
|
+192,
|
|
+128,
|
|
+3,
|
|
+32,
|
|
+8,
|
|
+20,
|
|
+0,
|
|
+76,
|
|
+254,
|
|
+48,
|
|
+192,
|
|
+4,
|
|
+4,
|
|
+32,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+140,
|
|
+248,
|
|
+44,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+32,
|
|
+48,
|
|
+4,
|
|
+0,
|
|
+128,
|
|
+69,
|
|
+113,
|
|
+66,
|
|
+242,
|
|
+140,
|
|
+211,
|
|
+192,
|
|
+34,
|
|
+31,
|
|
+41,
|
|
+3,
|
|
+70,
|
|
+192,
|
|
+80,
|
|
+7,
|
|
+164,
|
|
+255,
|
|
+36,
|
|
+204,
|
|
+96,
|
|
+2,
|
|
+0,
|
|
+248,
|
|
+62,
|
|
+0,
|
|
+3,
|
|
+255,
|
|
+55,
|
|
+208,
|
|
+120,
|
|
+3,
|
|
+224,
|
|
+3,
|
|
+190,
|
|
+11,
|
|
+16,
|
|
+139,
|
|
+246,
|
|
+91,
|
|
+0,
|
|
+103,
|
|
+90,
|
|
+0,
|
|
+70,
|
|
+192,
|
|
+80,
|
|
+7,
|
|
+164,
|
|
+255,
|
|
+36,
|
|
+204,
|
|
+224,
|
|
+2,
|
|
+0,
|
|
+248,
|
|
+62,
|
|
+0,
|
|
+3,
|
|
+255,
|
|
+55,
|
|
+208,
|
|
+120,
|
|
+3,
|
|
+224,
|
|
+3,
|
|
+190,
|
|
+11,
|
|
+16,
|
|
+139,
|
|
+246,
|
|
+91,
|
|
+0,
|
|
+103,
|
|
+90,
|
|
+0,
|
|
+225,
|
|
+64,
|
|
+242,
|
|
+64,
|
|
+3,
|
|
+232,
|
|
+128,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+7,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+57,
|
|
+239,
|
|
+224,
|
|
+247,
|
|
+255,
|
|
+255,
|
|
+72,
|
|
+192,
|
|
+95,
|
|
+207,
|
|
+88,
|
|
+122,
|
|
+88,
|
|
+124,
|
|
+137,
|
|
+64,
|
|
+26,
|
|
+64,
|
|
+4,
|
|
+232,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+149,
|
|
+96,
|
|
+161,
|
|
+64,
|
|
+152,
|
|
+64,
|
|
+128,
|
|
+144,
|
|
+35,
|
|
+0,
|
|
+72,
|
|
+232,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+0,
|
|
+65,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+27,
|
|
+0,
|
|
+4,
|
|
+232,
|
|
+0,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+69,
|
|
+96,
|
|
+145,
|
|
+64,
|
|
+168,
|
|
+64,
|
|
+128,
|
|
+144,
|
|
+19,
|
|
+0,
|
|
+72,
|
|
+232,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+0,
|
|
+65,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+11,
|
|
+0,
|
|
+74,
|
|
+232,
|
|
+0,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+242,
|
|
+140,
|
|
+221,
|
|
+192,
|
|
+57,
|
|
+239,
|
|
+32,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+41,
|
|
+3,
|
|
+239,
|
|
+3,
|
|
+12,
|
|
+248,
|
|
+0,
|
|
+128,
|
|
+0,
|
|
+0,
|
|
+192,
|
|
+248,
|
|
+4,
|
|
+0,
|
|
+12,
|
|
+248,
|
|
+0,
|
|
+132,
|
|
+64,
|
|
+0,
|
|
+192,
|
|
+248,
|
|
+4,
|
|
+0,
|
|
+0,
|
|
+96,
|
|
+255,
|
|
+159,
|
|
+154,
|
|
+255,
|
|
+0,
|
|
+232,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+0,
|
|
+255,
|
|
+159,
|
|
+165,
|
|
+255,
|
|
+4,
|
|
+255,
|
|
+48,
|
|
+204,
|
|
+16,
|
|
+3,
|
|
+224,
|
|
+251,
|
|
+62,
|
|
+0,
|
|
+4,
|
|
+255,
|
|
+51,
|
|
+204,
|
|
+128,
|
|
+3,
|
|
+224,
|
|
+251,
|
|
+16,
|
|
+0,
|
|
+76,
|
|
+254,
|
|
+51,
|
|
+204,
|
|
+128,
|
|
+3,
|
|
+224,
|
|
+251,
|
|
+20,
|
|
+0,
|
|
+128,
|
|
+64,
|
|
+6,
|
|
+232,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+140,
|
|
+248,
|
|
+47,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+224,
|
|
+99,
|
|
+0,
|
|
+0,
|
|
+32,
|
|
+247,
|
|
+240,
|
|
+207,
|
|
+16,
|
|
+3,
|
|
+32,
|
|
+247,
|
|
+176,
|
|
+207,
|
|
+17,
|
|
+19,
|
|
+32,
|
|
+247,
|
|
+112,
|
|
+207,
|
|
+18,
|
|
+35,
|
|
+32,
|
|
+247,
|
|
+48,
|
|
+207,
|
|
+19,
|
|
+51,
|
|
+32,
|
|
+247,
|
|
+240,
|
|
+206,
|
|
+20,
|
|
+67,
|
|
+32,
|
|
+247,
|
|
+176,
|
|
+206,
|
|
+21,
|
|
+83,
|
|
+32,
|
|
+247,
|
|
+112,
|
|
+206,
|
|
+22,
|
|
+99,
|
|
+32,
|
|
+247,
|
|
+48,
|
|
+206,
|
|
+23,
|
|
+115,
|
|
+32,
|
|
+247,
|
|
+240,
|
|
+205,
|
|
+24,
|
|
+131,
|
|
+32,
|
|
+247,
|
|
+176,
|
|
+205,
|
|
+25,
|
|
+147,
|
|
+32,
|
|
+247,
|
|
+112,
|
|
+205,
|
|
+26,
|
|
+163,
|
|
+32,
|
|
+247,
|
|
+48,
|
|
+205,
|
|
+27,
|
|
+179,
|
|
+32,
|
|
+247,
|
|
+240,
|
|
+204,
|
|
+28,
|
|
+195,
|
|
+32,
|
|
+247,
|
|
+176,
|
|
+204,
|
|
+29,
|
|
+211,
|
|
+32,
|
|
+247,
|
|
+112,
|
|
+204,
|
|
+30,
|
|
+227,
|
|
+32,
|
|
+247,
|
|
+48,
|
|
+204,
|
|
+31,
|
|
+243,
|
|
+4,
|
|
+255,
|
|
+51,
|
|
+204,
|
|
+128,
|
|
+3,
|
|
+224,
|
|
+251,
|
|
+16,
|
|
+0,
|
|
+76,
|
|
+254,
|
|
+51,
|
|
+204,
|
|
+128,
|
|
+3,
|
|
+224,
|
|
+251,
|
|
+20,
|
|
+0,
|
|
+0,
|
|
+237,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+140,
|
|
+248,
|
|
+47,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+224,
|
|
+99,
|
|
+0,
|
|
+0,
|
|
+111,
|
|
+3,
|
|
+4,
|
|
+254,
|
|
+0,
|
|
+128,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+2,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+140,
|
|
+248,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+224,
|
|
+35,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+193,
|
|
+232,
|
|
+0,
|
|
+1,
|
|
+0,
|
|
+0,
|
|
+1,
|
|
+106,
|
|
+116,
|
|
+30,
|
|
+90,
|
|
+0,
|
|
+169,
|
|
+3,
|
|
+73,
|
|
+64,
|
|
+52,
|
|
+64,
|
|
+45,
|
|
+64,
|
|
+2,
|
|
+64,
|
|
+10,
|
|
+64,
|
|
+64,
|
|
+198,
|
|
+1,
|
|
+7,
|
|
+8,
|
|
+232,
|
|
+63,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+6,
|
|
+232,
|
|
+253,
|
|
+255,
|
|
+255,
|
|
+255,
|
|
+0,
|
|
+246,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+215,
|
|
+64,
|
|
+3,
|
|
+96,
|
|
+2,
|
|
+248,
|
|
+0,
|
|
+35,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+248,
|
|
+0,
|
|
+36,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+128,
|
|
+240,
|
|
+0,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+128,
|
|
+144,
|
|
+137,
|
|
+0,
|
|
+131,
|
|
+98,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+129,
|
|
+0,
|
|
+131,
|
|
+102,
|
|
+0,
|
|
+158,
|
|
+67,
|
|
+0,
|
|
+2,
|
|
+248,
|
|
+0,
|
|
+35,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+248,
|
|
+0,
|
|
+36,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+128,
|
|
+240,
|
|
+0,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+128,
|
|
+144,
|
|
+108,
|
|
+0,
|
|
+131,
|
|
+98,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+100,
|
|
+0,
|
|
+131,
|
|
+102,
|
|
+0,
|
|
+248,
|
|
+64,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+128,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+128,
|
|
+144,
|
|
+161,
|
|
+0,
|
|
+188,
|
|
+64,
|
|
+67,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+150,
|
|
+0,
|
|
+195,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+128,
|
|
+7,
|
|
+192,
|
|
+130,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+192,
|
|
+224,
|
|
+16,
|
|
+195,
|
|
+31,
|
|
+132,
|
|
+248,
|
|
+1,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+224,
|
|
+16,
|
|
+203,
|
|
+31,
|
|
+3,
|
|
+99,
|
|
+131,
|
|
+71,
|
|
+68,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+99,
|
|
+2,
|
|
+99,
|
|
+23,
|
|
+102,
|
|
+7,
|
|
+106,
|
|
+127,
|
|
+156,
|
|
+182,
|
|
+255,
|
|
+0,
|
|
+248,
|
|
+64,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+128,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+128,
|
|
+144,
|
|
+112,
|
|
+0,
|
|
+188,
|
|
+64,
|
|
+67,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+101,
|
|
+0,
|
|
+195,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+128,
|
|
+7,
|
|
+192,
|
|
+130,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+192,
|
|
+224,
|
|
+16,
|
|
+195,
|
|
+31,
|
|
+132,
|
|
+248,
|
|
+1,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+224,
|
|
+16,
|
|
+203,
|
|
+31,
|
|
+25,
|
|
+102,
|
|
+9,
|
|
+106,
|
|
+2,
|
|
+30,
|
|
+41,
|
|
+3,
|
|
+26,
|
|
+87,
|
|
+162,
|
|
+64,
|
|
+64,
|
|
+198,
|
|
+1,
|
|
+23,
|
|
+127,
|
|
+158,
|
|
+103,
|
|
+255,
|
|
+239,
|
|
+3,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+143,
|
|
+92,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+64,
|
|
+143,
|
|
+93,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+143,
|
|
+94,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+143,
|
|
+95,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+142,
|
|
+208,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+142,
|
|
+209,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+64,
|
|
+142,
|
|
+210,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+142,
|
|
+211,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+107,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+99,
|
|
+23,
|
|
+0,
|
|
+212,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+163,
|
|
+23,
|
|
+0,
|
|
+228,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+227,
|
|
+23,
|
|
+0,
|
|
+244,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+35,
|
|
+52,
|
|
+0,
|
|
+180,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+99,
|
|
+52,
|
|
+0,
|
|
+164,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+163,
|
|
+52,
|
|
+0,
|
|
+148,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+111,
|
|
+3,
|
|
+239,
|
|
+3,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+143,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+64,
|
|
+143,
|
|
+13,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+143,
|
|
+14,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+143,
|
|
+15,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+142,
|
|
+16,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+142,
|
|
+17,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+64,
|
|
+142,
|
|
+18,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+142,
|
|
+19,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+33,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+99,
|
|
+3,
|
|
+0,
|
|
+212,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+163,
|
|
+3,
|
|
+0,
|
|
+228,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+227,
|
|
+3,
|
|
+0,
|
|
+244,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+35,
|
|
+4,
|
|
+0,
|
|
+180,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+99,
|
|
+4,
|
|
+0,
|
|
+164,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+163,
|
|
+4,
|
|
+0,
|
|
+148,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+111,
|
|
+3,
|
|
+32,
|
|
+246,
|
|
+192,
|
|
+11,
|
|
+1,
|
|
+16,
|
|
+32,
|
|
+246,
|
|
+2,
|
|
+137,
|
|
+47,
|
|
+240,
|
|
+40,
|
|
+246,
|
|
+2,
|
|
+140,
|
|
+47,
|
|
+240,
|
|
+128,
|
|
+245,
|
|
+99,
|
|
+140,
|
|
+5,
|
|
+4,
|
|
+0,
|
|
+247,
|
|
+99,
|
|
+140,
|
|
+1,
|
|
+20,
|
|
+88,
|
|
+246,
|
|
+99,
|
|
+140,
|
|
+1,
|
|
+20,
|
|
+0,
|
|
+247,
|
|
+35,
|
|
+136,
|
|
+62,
|
|
+226,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+136,
|
|
+32,
|
|
+210,
|
|
+0,
|
|
+247,
|
|
+34,
|
|
+136,
|
|
+63,
|
|
+2,
|
|
+208,
|
|
+246,
|
|
+34,
|
|
+136,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+247,
|
|
+99,
|
|
+136,
|
|
+58,
|
|
+162,
|
|
+32,
|
|
+247,
|
|
+99,
|
|
+136,
|
|
+33,
|
|
+146,
|
|
+0,
|
|
+247,
|
|
+98,
|
|
+136,
|
|
+59,
|
|
+18,
|
|
+208,
|
|
+246,
|
|
+98,
|
|
+136,
|
|
+0,
|
|
+20,
|
|
+0,
|
|
+247,
|
|
+162,
|
|
+136,
|
|
+33,
|
|
+2,
|
|
+88,
|
|
+246,
|
|
+98,
|
|
+137,
|
|
+2,
|
|
+68,
|
|
+88,
|
|
+246,
|
|
+162,
|
|
+137,
|
|
+3,
|
|
+68,
|
|
+208,
|
|
+254,
|
|
+227,
|
|
+136,
|
|
+60,
|
|
+242,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+11,
|
|
+208,
|
|
+254,
|
|
+227,
|
|
+136,
|
|
+56,
|
|
+178,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+32,
|
|
+255,
|
|
+226,
|
|
+136,
|
|
+38,
|
|
+58,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+0,
|
|
+208,
|
|
+254,
|
|
+227,
|
|
+136,
|
|
+59,
|
|
+242,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+32,
|
|
+255,
|
|
+226,
|
|
+136,
|
|
+49,
|
|
+58,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+226,
|
|
+136,
|
|
+34,
|
|
+34,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+32,
|
|
+255,
|
|
+226,
|
|
+136,
|
|
+37,
|
|
+58,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+136,
|
|
+1,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+0,
|
|
+160,
|
|
+0,
|
|
+255,
|
|
+194,
|
|
+8,
|
|
+0,
|
|
+52,
|
|
+195,
|
|
+243,
|
|
+0,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+202,
|
|
+40,
|
|
+0,
|
|
+52,
|
|
+195,
|
|
+243,
|
|
+0,
|
|
+128,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+35,
|
|
+10,
|
|
+0,
|
|
+240,
|
|
+60,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+136,
|
|
+1,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+0,
|
|
+160,
|
|
+0,
|
|
+255,
|
|
+226,
|
|
+140,
|
|
+34,
|
|
+34,
|
|
+195,
|
|
+243,
|
|
+60,
|
|
+0,
|
|
+32,
|
|
+255,
|
|
+227,
|
|
+140,
|
|
+36,
|
|
+58,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+136,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+0,
|
|
+160,
|
|
+16,
|
|
+246,
|
|
+226,
|
|
+136,
|
|
+35,
|
|
+50,
|
|
+16,
|
|
+246,
|
|
+226,
|
|
+136,
|
|
+35,
|
|
+50,
|
|
+32,
|
|
+246,
|
|
+226,
|
|
+136,
|
|
+35,
|
|
+50,
|
|
+32,
|
|
+254,
|
|
+226,
|
|
+136,
|
|
+35,
|
|
+58,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+0,
|
|
+11,
|
|
+96,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+1,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+115,
|
|
+5,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+173,
|
|
+1,
|
|
+27,
|
|
+96,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+1,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+147,
|
|
+5,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+227,
|
|
+0,
|
|
+64,
|
|
+246,
|
|
+163,
|
|
+140,
|
|
+1,
|
|
+4,
|
|
+0,
|
|
+246,
|
|
+192,
|
|
+175,
|
|
+63,
|
|
+2,
|
|
+0,
|
|
+246,
|
|
+192,
|
|
+174,
|
|
+59,
|
|
+2,
|
|
+0,
|
|
+246,
|
|
+128,
|
|
+175,
|
|
+62,
|
|
+2,
|
|
+0,
|
|
+246,
|
|
+128,
|
|
+174,
|
|
+58,
|
|
+2,
|
|
+0,
|
|
+246,
|
|
+64,
|
|
+175,
|
|
+61,
|
|
+2,
|
|
+0,
|
|
+246,
|
|
+64,
|
|
+174,
|
|
+57,
|
|
+2,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+4,
|
|
+212,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+240,
|
|
+1,
|
|
+228,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+240,
|
|
+1,
|
|
+244,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+240,
|
|
+1,
|
|
+180,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+141,
|
|
+0,
|
|
+164,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+3,
|
|
+68,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+191,
|
|
+66,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+50,
|
|
+66,
|
|
+0,
|
|
+255,
|
|
+235,
|
|
+143,
|
|
+52,
|
|
+242,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+2,
|
|
+212,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+191,
|
|
+226,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+141,
|
|
+0,
|
|
+180,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+2,
|
|
+68,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+190,
|
|
+66,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+50,
|
|
+66,
|
|
+0,
|
|
+255,
|
|
+171,
|
|
+143,
|
|
+52,
|
|
+226,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+4,
|
|
+180,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+191,
|
|
+226,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+128,
|
|
+253,
|
|
+43,
|
|
+240,
|
|
+3,
|
|
+212,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+35,
|
|
+141,
|
|
+1,
|
|
+196,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+3,
|
|
+68,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+189,
|
|
+66,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+50,
|
|
+66,
|
|
+0,
|
|
+255,
|
|
+107,
|
|
+143,
|
|
+52,
|
|
+210,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+4,
|
|
+148,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+240,
|
|
+1,
|
|
+164,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+240,
|
|
+1,
|
|
+180,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+240,
|
|
+1,
|
|
+244,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+141,
|
|
+0,
|
|
+228,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+3,
|
|
+68,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+187,
|
|
+66,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+50,
|
|
+66,
|
|
+0,
|
|
+255,
|
|
+235,
|
|
+142,
|
|
+52,
|
|
+178,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+2,
|
|
+148,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+187,
|
|
+162,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+43,
|
|
+141,
|
|
+0,
|
|
+244,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+2,
|
|
+68,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+186,
|
|
+66,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+50,
|
|
+66,
|
|
+0,
|
|
+255,
|
|
+171,
|
|
+142,
|
|
+52,
|
|
+162,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+4,
|
|
+244,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+0,
|
|
+255,
|
|
+43,
|
|
+240,
|
|
+187,
|
|
+162,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+128,
|
|
+253,
|
|
+43,
|
|
+240,
|
|
+3,
|
|
+148,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+254,
|
|
+35,
|
|
+141,
|
|
+1,
|
|
+132,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+3,
|
|
+68,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+185,
|
|
+66,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+50,
|
|
+66,
|
|
+0,
|
|
+255,
|
|
+107,
|
|
+142,
|
|
+52,
|
|
+146,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+64,
|
|
+255,
|
|
+98,
|
|
+141,
|
|
+0,
|
|
+52,
|
|
+192,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+53,
|
|
+10,
|
|
+0,
|
|
+240,
|
|
+60,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+1,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+147,
|
|
+5,
|
|
+106,
|
|
+0,
|
|
+144,
|
|
+177,
|
|
+0,
|
|
+88,
|
|
+246,
|
|
+163,
|
|
+140,
|
|
+1,
|
|
+4,
|
|
+128,
|
|
+245,
|
|
+99,
|
|
+141,
|
|
+10,
|
|
+4,
|
|
+88,
|
|
+246,
|
|
+162,
|
|
+138,
|
|
+1,
|
|
+68,
|
|
+0,
|
|
+247,
|
|
+162,
|
|
+138,
|
|
+36,
|
|
+162,
|
|
+88,
|
|
+254,
|
|
+162,
|
|
+138,
|
|
+3,
|
|
+164,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+0,
|
|
+255,
|
|
+226,
|
|
+137,
|
|
+32,
|
|
+2,
|
|
+195,
|
|
+243,
|
|
+60,
|
|
+0,
|
|
+32,
|
|
+247,
|
|
+226,
|
|
+137,
|
|
+42,
|
|
+114,
|
|
+0,
|
|
+255,
|
|
+34,
|
|
+138,
|
|
+33,
|
|
+18,
|
|
+195,
|
|
+243,
|
|
+60,
|
|
+0,
|
|
+32,
|
|
+247,
|
|
+34,
|
|
+138,
|
|
+42,
|
|
+130,
|
|
+16,
|
|
+246,
|
|
+98,
|
|
+138,
|
|
+40,
|
|
+114,
|
|
+16,
|
|
+246,
|
|
+98,
|
|
+138,
|
|
+41,
|
|
+146,
|
|
+32,
|
|
+246,
|
|
+98,
|
|
+138,
|
|
+41,
|
|
+146,
|
|
+32,
|
|
+246,
|
|
+226,
|
|
+137,
|
|
+41,
|
|
+146,
|
|
+40,
|
|
+246,
|
|
+34,
|
|
+138,
|
|
+41,
|
|
+146,
|
|
+32,
|
|
+247,
|
|
+163,
|
|
+141,
|
|
+63,
|
|
+178,
|
|
+32,
|
|
+247,
|
|
+227,
|
|
+141,
|
|
+62,
|
|
+162,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+8,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+128,
|
|
+11,
|
|
+128,
|
|
+253,
|
|
+35,
|
|
+240,
|
|
+9,
|
|
+100,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+128,
|
|
+253,
|
|
+163,
|
|
+141,
|
|
+128,
|
|
+115,
|
|
+192,
|
|
+243,
|
|
+152,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+163,
|
|
+141,
|
|
+4,
|
|
+100,
|
|
+208,
|
|
+246,
|
|
+35,
|
|
+139,
|
|
+0,
|
|
+100,
|
|
+32,
|
|
+255,
|
|
+34,
|
|
+139,
|
|
+53,
|
|
+202,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+139,
|
|
+0,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+0,
|
|
+160,
|
|
+240,
|
|
+246,
|
|
+163,
|
|
+141,
|
|
+48,
|
|
+98,
|
|
+0,
|
|
+247,
|
|
+99,
|
|
+139,
|
|
+63,
|
|
+210,
|
|
+0,
|
|
+247,
|
|
+98,
|
|
+139,
|
|
+1,
|
|
+212,
|
|
+88,
|
|
+254,
|
|
+98,
|
|
+139,
|
|
+1,
|
|
+212,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+32,
|
|
+255,
|
|
+99,
|
|
+139,
|
|
+62,
|
|
+98,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+98,
|
|
+139,
|
|
+1,
|
|
+212,
|
|
+240,
|
|
+246,
|
|
+98,
|
|
+139,
|
|
+50,
|
|
+210,
|
|
+0,
|
|
+247,
|
|
+163,
|
|
+128,
|
|
+59,
|
|
+146,
|
|
+0,
|
|
+247,
|
|
+160,
|
|
+128,
|
|
+1,
|
|
+36,
|
|
+88,
|
|
+254,
|
|
+160,
|
|
+128,
|
|
+1,
|
|
+36,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+0,
|
|
+247,
|
|
+163,
|
|
+128,
|
|
+58,
|
|
+98,
|
|
+64,
|
|
+255,
|
|
+35,
|
|
+240,
|
|
+0,
|
|
+100,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+64,
|
|
+255,
|
|
+163,
|
|
+128,
|
|
+0,
|
|
+164,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+160,
|
|
+128,
|
|
+1,
|
|
+36,
|
|
+240,
|
|
+246,
|
|
+160,
|
|
+128,
|
|
+50,
|
|
+34,
|
|
+8,
|
|
+255,
|
|
+227,
|
|
+143,
|
|
+54,
|
|
+242,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+40,
|
|
+255,
|
|
+227,
|
|
+142,
|
|
+54,
|
|
+178,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+39,
|
|
+10,
|
|
+0,
|
|
+240,
|
|
+60,
|
|
+128,
|
|
+8,
|
|
+255,
|
|
+163,
|
|
+143,
|
|
+45,
|
|
+226,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+44,
|
|
+10,
|
|
+0,
|
|
+240,
|
|
+60,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+240,
|
|
+40,
|
|
+10,
|
|
+0,
|
|
+240,
|
|
+60,
|
|
+128,
|
|
+8,
|
|
+255,
|
|
+163,
|
|
+142,
|
|
+2,
|
|
+162,
|
|
+192,
|
|
+243,
|
|
+60,
|
|
+128,
|
|
+90,
|
|
+0,
|
|
+169,
|
|
+3,
|
|
+14,
|
|
+96,
|
|
+4,
|
|
+31,
|
|
+169,
|
|
+3,
|
|
+30,
|
|
+96,
|
|
+1,
|
|
+31,
|
|
+73,
|
|
+64,
|
|
+52,
|
|
+64,
|
|
+45,
|
|
+64,
|
|
+2,
|
|
+64,
|
|
+10,
|
|
+64,
|
|
+64,
|
|
+198,
|
|
+1,
|
|
+7,
|
|
+8,
|
|
+232,
|
|
+63,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+6,
|
|
+232,
|
|
+253,
|
|
+255,
|
|
+255,
|
|
+255,
|
|
+0,
|
|
+246,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+215,
|
|
+64,
|
|
+3,
|
|
+96,
|
|
+2,
|
|
+248,
|
|
+0,
|
|
+35,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+248,
|
|
+0,
|
|
+36,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+30,
|
|
+106,
|
|
+132,
|
|
+24,
|
|
+128,
|
|
+240,
|
|
+0,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+128,
|
|
+144,
|
|
+143,
|
|
+0,
|
|
+131,
|
|
+98,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+135,
|
|
+0,
|
|
+131,
|
|
+102,
|
|
+0,
|
|
+158,
|
|
+71,
|
|
+0,
|
|
+2,
|
|
+248,
|
|
+0,
|
|
+35,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+0,
|
|
+0,
|
|
+4,
|
|
+248,
|
|
+0,
|
|
+36,
|
|
+0,
|
|
+0,
|
|
+64,
|
|
+56,
|
|
+8,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+64,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+30,
|
|
+106,
|
|
+132,
|
|
+24,
|
|
+128,
|
|
+240,
|
|
+0,
|
|
+0,
|
|
+132,
|
|
+3,
|
|
+128,
|
|
+144,
|
|
+112,
|
|
+0,
|
|
+131,
|
|
+98,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+104,
|
|
+0,
|
|
+131,
|
|
+102,
|
|
+0,
|
|
+248,
|
|
+64,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+30,
|
|
+106,
|
|
+134,
|
|
+24,
|
|
+128,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+128,
|
|
+144,
|
|
+123,
|
|
+0,
|
|
+188,
|
|
+64,
|
|
+67,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+112,
|
|
+0,
|
|
+195,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+128,
|
|
+7,
|
|
+192,
|
|
+130,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+192,
|
|
+224,
|
|
+16,
|
|
+195,
|
|
+31,
|
|
+132,
|
|
+248,
|
|
+1,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+224,
|
|
+16,
|
|
+203,
|
|
+31,
|
|
+3,
|
|
+99,
|
|
+131,
|
|
+71,
|
|
+68,
|
|
+232,
|
|
+32,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+99,
|
|
+2,
|
|
+99,
|
|
+23,
|
|
+102,
|
|
+7,
|
|
+106,
|
|
+127,
|
|
+156,
|
|
+178,
|
|
+255,
|
|
+0,
|
|
+248,
|
|
+64,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+30,
|
|
+106,
|
|
+134,
|
|
+24,
|
|
+128,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+192,
|
|
+243,
|
|
+211,
|
|
+31,
|
|
+128,
|
|
+144,
|
|
+72,
|
|
+0,
|
|
+188,
|
|
+64,
|
|
+67,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+0,
|
|
+255,
|
|
+64,
|
|
+0,
|
|
+0,
|
|
+20,
|
|
+200,
|
|
+243,
|
|
+0,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+61,
|
|
+0,
|
|
+195,
|
|
+232,
|
|
+0,
|
|
+2,
|
|
+0,
|
|
+0,
|
|
+12,
|
|
+128,
|
|
+7,
|
|
+192,
|
|
+130,
|
|
+248,
|
|
+0,
|
|
+0,
|
|
+112,
|
|
+192,
|
|
+224,
|
|
+16,
|
|
+195,
|
|
+31,
|
|
+132,
|
|
+248,
|
|
+1,
|
|
+0,
|
|
+112,
|
|
+0,
|
|
+224,
|
|
+16,
|
|
+203,
|
|
+31,
|
|
+25,
|
|
+102,
|
|
+9,
|
|
+106,
|
|
+2,
|
|
+30,
|
|
+41,
|
|
+3,
|
|
+26,
|
|
+87,
|
|
+162,
|
|
+64,
|
|
+64,
|
|
+198,
|
|
+1,
|
|
+23,
|
|
+127,
|
|
+158,
|
|
+95,
|
|
+255,
|
|
+239,
|
|
+3,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+143,
|
|
+94,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+143,
|
|
+95,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+142,
|
|
+208,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+142,
|
|
+209,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+47,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+227,
|
|
+23,
|
|
+0,
|
|
+244,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+35,
|
|
+52,
|
|
+0,
|
|
+180,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+111,
|
|
+3,
|
|
+239,
|
|
+3,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+143,
|
|
+14,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+143,
|
|
+15,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+192,
|
|
+142,
|
|
+16,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+0,
|
|
+254,
|
|
+128,
|
|
+142,
|
|
+17,
|
|
+0,
|
|
+0,
|
|
+240,
|
|
+12,
|
|
+0,
|
|
+128,
|
|
+144,
|
|
+13,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+227,
|
|
+3,
|
|
+0,
|
|
+244,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+8,
|
|
+255,
|
|
+35,
|
|
+4,
|
|
+0,
|
|
+180,
|
|
+192,
|
|
+51,
|
|
+0,
|
|
+0,
|
|
+111,
|
|
+3,
|
|
+32,
|
|
+246,
|
|
+192,
|
|
+11,
|
|
+1,
|
|
+16,
|
|
+32,
|
|
+246,
|
|
+2,
|
|
+140,
|
|
+47,
|
|
+240,
|
|
+32,
|
|
+247,
|
|
+35,
|
|
+141,
|
|
+63,
|
|
+178,
|
|
+64,
|
|
+254,
|
|
+35,
|
|
+141,
|
|
+2,
|
|
+68,
|
|
+192,
|
|
+243,
|
|
+128,
|
|
+11,
|
|
+32,
|
|
+255,
|
|
+35,
|
|
+240,
|
|
+58,
|
|
+226,
|
|
+192,
|
|
+243,
|
|
+188,
|
|
+10,
|
|
+0,
|
|
+254,
|
|
+0,
|
|
+141,
|
|
+4,
|
|
+4,
|
|
+0,
|
|
+240,
|
|
+128,
|
|
+10,
|
|
+88,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+3,
|
|
+68,
|
|
+240,
|
|
+246,
|
|
+35,
|
|
+141,
|
|
+48,
|
|
+66,
|
|
+0,
|
|
+247,
|
|
+227,
|
|
+143,
|
|
+52,
|
|
+242,
|
|
+32,
|
|
+247,
|
|
+227,
|
|
+142,
|
|
+52,
|
|
+178,
|
|
+90,
|
|
+0,
|
|
+161,
|
|
+3,
|
|
+6,
|
|
+64,
|
|
+23,
|
|
+64,
|
|
+96,
|
|
+8,
|
|
+70,
|
|
+98,
|
|
+97,
|
|
+8,
|
|
+70,
|
|
+98,
|
|
+98,
|
|
+8,
|
|
+70,
|
|
+98,
|
|
+99,
|
|
+8,
|
|
+70,
|
|
+98,
|
|
+100,
|
|
+8,
|
|
+70,
|
|
+98,
|
|
+101,
|
|
+8,
|
|
+70,
|
|
+98,
|
|
+255,
|
|
+159,
|
|
+8,
|
|
+250,
|
|
+23,
|
|
+102,
|
|
+7,
|
|
+106,
|
|
+112,
|
|
+30,
|
|
+33,
|
|
+3,
|
|
+};
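The byte array above appears to be the assembled VideoCore image of rpi_hevc_transform.s, which follows. Elsewhere in this patch (see rpi_deblock() earlier) such VPU code is started with vpu_post_code2(vpu_get_fn(), ...); a minimal sketch of that call shape, with the prototypes and argument meanings assumed rather than documented (the real declarations presumably live in rpi_qpu.h):

/* Sketch only: prototypes mirror the call made in rpi_deblock(). */
extern unsigned int vpu_get_fn(void);
extern int vpu_post_code2(unsigned int fn, unsigned int r0, int r1,
                          int r2, int r3, int r4, int r5, int r6);

static int run_vpu_command_list(unsigned int cmds_vc, int ncmds)
{
    /* r5 == 5 selects hevc_run_command_list in the listing below */
    return vpu_post_code2(vpu_get_fn(), cmds_vc, ncmds, 0, 0, 0, 5, 0);
}

The assembly header below describes the transform as a row vector times a matrix with an even/odd split. Ignoring the vector unit, each col_trans_16 pass amounts to an ordinary 16x16 matrix product; a rough scalar model, offered as an assumption for orientation only:

#include <stdint.h>

/* Rough scalar model of one col_trans_16 pass: 16 partial products
 * per output column (the vmul32s into HY(48++,0)) followed by a sum
 * (the accumulating vadd ... CLRA SACC). */
static void col_trans_16_model(int32_t out[16], const int16_t in[16],
                               const int16_t mat[16][16])
{
    for (int col = 0; col < 16; col++) {
        int32_t acc = 0;
        for (int k = 0; k < 16; k++)
            acc += (int32_t)in[k] * mat[k][col]; /* partial products */
        out[col] = acc;                          /* summed, pre-rounding */
    }
}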
diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
new file mode 100644
index 0000000..5543093
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.s
@@ -0,0 +1,917 @@
+# ******************************************************************************
+# Argon Design Ltd.
+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+#
+# Module : HEVC
+# Author : Peter de Rivaz
+# ******************************************************************************
+
+# HEVC VPU Transform
+#
+# Transform matrix can be thought of as
+#   output row vector = input row vector * transMatrix2
+#
+# The even rows of the matrix are symmetric
+# The odd rows of the matrix are antisymmetric
+#
+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+#
+# EXAMPLE
+#   (a b c d) (1 2  2  1)
+#             (3 4 -4 -3)
+#             (5 6  6  5)
+#             (7 8 -8 -7)
+#
+# x=(a c)(1 2) = 1a+5c 2a+6c
+#        (5 6)
+#
+# y=(b d)(3 4) = 3b+7d 4b+8d
+#        (7 8)
+#
+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+#
+# Final results are (u , v[::-1])
+#
+#
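+# Worked instance of the example above, with assumed values a=1 b=2 c=3 d=4:
+#   x = (1a+5c, 2a+6c) = (16, 20)    y = (3b+7d, 4b+8d) = (34, 40)
+#   u = x+y = (50, 60)               v = x-y = (-18, -20)
+# Direct evaluation of (a b c d) * matrix gives (50, 60, -20, -18),
+# i.e. (u , v[::-1]) as claimed.
+#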
|
|
+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
|
|
+# Apply the even matrix first and stop before rounding
|
|
+# Then apply the odd matrix in a full manner:
|
|
+#
|
|
+# First step is to compute partial products with the first input (16 cycles)
|
|
+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
|
|
+# 2a 4b 6c 8d
|
|
+# 2a -4b 6c -8d
|
|
+# 1a -3b 5c -7d
|
|
+#
|
|
+# Second step is to sum partial products into final position (8 cycles)
|
|
+# 1a+3b+5c+7d
|
|
+# 2a+4b+6c+8d
|
|
+# 2a-4b+6c-8d
|
|
+# 1a-3b+5c-7d
|
|
+#
|
|
+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
|
|
+#
|
|
+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
|
|
+#
|
|
+# For 8x8 we could compute two in parallel.
|
|
+#
|
|
+#
|
|
+
|
|
+# Columns are transformed first
|
|
+#
|
|
+# Store top left half of transMatrix2 in
|
|
+# Store bottom left half of transMatrix2 in HX(32,32)
|
|
+#
|
|
+# For 16x16
|
|
+# HX(0:15,0) contains input data before transform
|
|
+# HY(0:15,0) contains 32bit output data after transform
|
|
+# HX(32,0) contains even rows of left half of transMatrix2
|
|
+# HX(32,32) contains odd rows of left half of transMatrix2
|
|
+# HY(48,0) contains partial products ready for summing
|
|
+#
|
|
+
|
|
+
|
|
+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
|
|
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
|
|
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
|
|
+# num: number of 16x16 transforms to be done
|
|
+# coeffs32
|
|
+# num32: number of 32x32 transforms
|
|
+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
|
|
+#
|
|
+hevc_trans_16x16:
|
|
+ cmp r5,1
|
|
+ beq memclear16
|
|
+ cmp r5,2
|
|
+ beq hevc_deblock_16x16
|
|
+ cmp r5,3
|
|
+ beq hevc_uv_deblock_16x16
|
|
+ cmp r5,4
|
|
+ beq hevc_uv_deblock_16x16_with_clear
|
|
+ cmp r5,5
|
|
+ beq hevc_run_command_list
|
|
+
|
|
+ push r6-r15, lr # TODO cut down number of used registers
|
|
+ mov r14,r3 # coeffs32
|
|
+ mov r15,r4 # num32
|
|
+ mov r3, 16*2 # Stride of transMatrix2 in bytes
|
|
+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
|
|
+
|
|
+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
|
|
+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
|
|
+
|
|
+ # Now use r0 to describe which matrix we are working on.
|
|
+ # Allows us to prefetch the next block of coefficients for efficiency.
|
|
+ mov r0,0 # This describes the location where we read our coefficients from
|
|
+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
|
|
+ mov r7,16*16*2 # Total block size
|
|
+ mov r8,64*16 # Value used to swap from current to next VRF location
|
|
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
|
|
+ mov r4,64 # Constant used for rounding first pass
|
|
+ mov r5,1<<11 # Constant used for rounding second pass
|
|
+
|
|
+ # At start of block r0,r1 point to the current block (that has already been loaded)
|
|
+block_loop:
|
|
+ eor r0,r8
|
|
+ add r1,r7
|
|
+ # Prefetch the next block
|
|
+ vldh HX(0++,0)+r0,(r1 += r3) REP 16
|
|
+ eor r0,r8
|
|
+ sub r1,r7
|
|
+
|
|
+ # Transform the current block
|
|
+ bl col_trans_16
|
|
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
|
|
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
|
|
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
|
|
+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
|
|
+
|
|
+ bl col_trans_16
|
|
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
|
|
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
|
|
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
|
|
+
|
|
+ # Save results - note there has been a transposition during the processing so we save columns
|
|
+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
|
|
+
|
|
+ # Move onto next block
|
|
+ eor r0,r8
|
|
+ add r1,r7
|
|
+
|
|
+ addcmpbgt r2,-1,0,block_loop
|
|
+
|
|
+ # Now go and do any 32x32 transforms
|
|
+ b hevc_trans_32x32
|
|
+
|
|
+ pop r6-r15, pc
|
|
+
+# r1,r2,r3 r7,r8 should be preserved
+# HX(0++,0)+r0 is the block to be transformed
+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+# Use HY(48,0) for intermediate results
+# r0 can be used, but should be returned to its original value at the end
+col_trans_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+col_trans_odd_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_odd_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_odd_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+# transMatrix2: address of the constant matrix (must be at a 32-byte-aligned address in Videocore memory) Even followed by odd
+# coeffs: address of the transform coefficients (must be at a 32-byte-aligned address in Videocore memory)
+# num: number of 32x32 transforms to be done
+#
+hevc_trans_32x32:
+ mov r1,r14 # coeffs
+ mov r2,r15 # num
+
+ # Fetch odd transform matrix
+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+ #add r0, 16*16*2
+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+ mov r7, 16*16*2 # Total block size
+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+ # set r8 to a 32-byte-aligned stack pointer
+ add r8,sp,31
+ lsr r8,5
+ lsl r8,5
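+ # i.e. r8 = (sp + 31) & ~31, rounding up to the next 32-byte boundary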
+ mov r9,r8 # Backup of the temporary storage
+ mov r10,r1 # Backup of the coefficient buffer
+block_loop32:
+
+ # COLUMN TRANSFORM
+ mov r4, 64 # Constant used for rounding first pass
+ mov r5, 9 # left shift used for rounding first pass
+
+ # Transform the first 16 columns
+ mov r1,r10 # Input Coefficient buffer
+ mov r8,r9 # Output temporary storage
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ # ROW TRANSFORM
+ mov r4, 1<<11 # Constant used for rounding second pass
+ mov r5, 4 # left shift used for rounding second pass
+
+ mov r1,r9 # Input temporary storage
+ mov r8,r10 # Output Coefficient buffer
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ add r10, 32*32*2 # move onto next block of coefficients
+ addcmpbgt r2,-1,0,block_loop32
+
+ add sp,sp,32*32*2+32 # Restore stack
+
+ pop r6-r15, pc
+
+trans32:
+ push lr
+ # We can no longer afford the VRF space to do prefetching when doing 32x32
+ # Fetch the even rows
+ vldh HX(0++,0),(r1 += r3) REP 16
+ # Fetch the odd rows
+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+
+ # Transform the even rows using even matrix
+ mov r0, 0 # Even rows
+ bl col_trans_16
+
+ # Now transform the odd rows using odd matrix
+ mov r0, 64*16 # Odd rows
+ bl col_trans_odd_16
+
+ # Now apply butterfly to compute the first 16 results
+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ # 16bit results now in HX(48,32)
+ mov r0,r8
+ mov r6,32*2
+ vsth VX(48,32++),(r0+=r6) REP 16
+
+ # Now apply butterfly to compute the second 16 results (in reverse order)
+ vsub HY(63,0),HY(0 ,0),HY(16,0)
+ vsub HY(62,0),HY(1 ,0),HY(17,0)
+ vsub HY(61,0),HY(2 ,0),HY(18,0)
+ vsub HY(60,0),HY(3 ,0),HY(19,0)
+ vsub HY(59,0),HY(4 ,0),HY(20,0)
+ vsub HY(58,0),HY(5 ,0),HY(21,0)
+ vsub HY(57,0),HY(6 ,0),HY(22,0)
+ vsub HY(56,0),HY(7 ,0),HY(23,0)
+ vsub HY(55,0),HY(8 ,0),HY(24,0)
+ vsub HY(54,0),HY(9 ,0),HY(25,0)
+ vsub HY(53,0),HY(10,0),HY(26,0)
+ vsub HY(52,0),HY(11,0),HY(27,0)
+ vsub HY(51,0),HY(12,0),HY(28,0)
+ vsub HY(50,0),HY(13,0),HY(29,0)
+ vsub HY(49,0),HY(14,0),HY(30,0)
+ vsub HY(48,0),HY(15,0),HY(31,0)
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ add r0,r8,32
+ vsth VX(48,32++),(r0+=r6) REP 16
+ pop pc
+
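+# The even/odd split used by trans32, in illustrative C (array names are
+# placeholders): a 32-point transform is a 16-point transform of the even
+# rows plus a 16-point transform of the odd rows, recombined by a butterfly:
+#
+#   for (int i = 0; i < 16; i++) {
+#     out[i]      = even[i] + odd[i];   // first 16 results
+#     out[31 - i] = even[i] - odd[i];   // second 16, stored in reverse
+#   }
+#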
+memclear16:
+ # r0 is address
+ # r1 is number of 16-bit values to set to 0 (may overrun past end and clear more than specified)
+ vmov HX(0++,0),0 REP 16
+ mov r2,32
+loop:
+ vsth HX(0++,0),(r0+=r2) REP 16
+ add r0,16*16*2
+ sub r1,16*16
+ cmp r1,0
+ bgt loop
+ b lr
+
+
+################################################################################
+# HEVC VPU Deblock
+#
+# Vertical edges before horizontal
+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+#
+# ARM is responsible for storing beta and tc for each 4-pixel horizontal and vertical edge.
+# The VPU code works in units of 16x16 blocks.
+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+# One final horizontal filter is required at the end.
+# PCM is not allowed in this code.
+#
+#
+# H(16-4:16+15,0) contains previous block (note that we need 4 lines of context above that may get altered during filtering)
+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering)
+
+.set P0,63
+.set P1,62
+.set P2,61
+.set P3,60
+.set Q0,59
+.set Q1,58
+.set Q2,57
+.set Q3,56
+
+.set dp,32
+.set dq,33
+.set d,34
+.set decision,35
+.set beta,36
+.set beta2,37
+.set beta3,38
+.set ptest,39
+.set qtest,40
+.set pqtest,41
+.set thresh,42
+.set deltatest, 44
+.set deltap1, 45
+.set tc25, 46
+.set setup,47
+.set tc,48
+.set tc25,49
+.set tc2, 50
+.set do_filter, 51
+.set delta, 52
+.set tc10, 53
+.set delta0, 54
+.set delta1, 55
+.set zeros, 0
+.set setup_input, 1
+.set deltaq1, 2
+
+
+
+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+# Row has num16 16x16 blocks across
+# Beta goes from 0 to 64
+# tc goes from 0 to 24
+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+# has 8 bytes per edge
+# has 16 bytes per direction
+# has 32 bytes per 16x16 block
+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
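+# Illustrative byte-offset calculation for one setup entry (our naming, not
+# part of the ABI): with dir 0=vert/1=horz, edge 0=first/1=second,
+# param 0=beta/1=tc and e the 4-pixel sub-edge number 0..3:
+#
+#   offset = block_idx*32 + dir*16 + edge*8 + param*4 + e;
+#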
+hevc_deblock_16x16:
+ push r6-r15, lr
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+
+process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl vert_filter
+ sub r3,8
+ b start_deblock_loop
+deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ vstb H(zeros,0),(r4)
+ bl vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ vstb H(zeros,0),-16(r4)
+ bl horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt start_again
+ pop r6-r15, pc
+start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+vert_filter:
+ push lr
+
+ vmov HX(P3,0), V(16,12)+r3
+ vmov HX(P2,0), V(16,13)+r3
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+ vmov HX(Q2,0), V(16,18)+r3
+ vmov HX(Q3,0), V(16,19)+r3
+
+ bl do_luma_filter
+
+ vadds V(16,13)+r3, HX(P2,0), 0
+ vadds V(16,14)+r3, HX(P1,0), 0
+ vadds V(16,15)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds V(16,16)+r3, HX(Q0,0), 0
+ vadds V(16,17)+r3, HX(Q1,0), 0
+ vadds V(16,18)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+horz_filter:
+ push lr
+
+ vmov HX(P3,0), H(12,0)+r3
+ vmov HX(P2,0), H(13,0)+r3
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+ vmov HX(Q2,0), H(18,0)+r3
+ vmov HX(Q3,0), H(19,0)+r3
+
+ bl do_luma_filter
+
+ vadds H(13,0)+r3, HX(P2,0), 0
+ vadds H(14,0)+r3, HX(P1,0), 0
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P3 and Q3 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+ vadds H(17,0)+r3, HX(Q1,0), 0
+ vadds H(18,0)+r3, HX(Q2,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4-pixel edge
+do_luma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8 tc*8
+ valtl HX(beta,0),H(setup,0),H(setup,0)
+ valtu HX(tc,0),H(setup,0),H(setup,0)
+ vmul HX(tc25,0), HX(tc,0), 5
+ vadd HX(tc25,0),HX(tc25,0), 1
+ vasr HX(tc25,0), HX(tc25,0), 1
+
+ # Compute decision
+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+
+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+
+ vadd HX(d,0), HX(dp,0), HX(dq,0)
+ vasr HX(beta2,0),HX(beta,0),2
+ vasr HX(beta3,0),HX(beta,0),3
+
+ # Compute flags that are negative if all conditions pass
+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+
+ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+ vmov HX(decision,0), 1 IFNN
+ vadd H(decision,0),H(decision,3),0 IFN
+ vadd H(decision,16),H(decision,19),0 IFN
+ vmov -,HX(decision,0) SETF # N marks strong filter
+ vmov HX(decision,0), 1 IFNN # NN marks normal filter
+
+ vadd HX(do_filter,0), HX(d,3), HX(d,0)
+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+ vmov HX(decision,0),0 IFNN # Z marks no filter
+
+ # Expand out decision (currently valid once every 4 pixels) 0...1...2...3
+ # First extract out even terms
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
+ # Now expand back
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+
+ # HX(decision,0) is negative if we want strong filtering, 1 for normal filtering, 0 for no filtering
+
+ # Do a quick check to see if there is anything to do
+ mov r11, 0 # Signal no filtering
+ vmov -,1 IFNZ SUMS r5
+ cmp r5,0
+ beq filtering_done
+ mov r11, 1 # Signal some filtering
+ # And whether there is any strong filtering
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq normal_filtering
+
+ ##############################################################################
+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+
+ # Take a copy of the original pixels for use in decision calculation
+ vmov HX(P0,32),HX(P0,0)
+ vmov HX(Q0,32),HX(Q0,0)
+ vmov HX(P1,32),HX(P1,0)
+ vmov HX(Q1,32),HX(Q1,0)
+ vmov HX(P2,32),HX(P2,0)
+ vmov HX(Q2,32),HX(Q2,0)
+
+ vadd -,HX(P2,32),4 CLRA SACC
+ vshl -,HX(P1,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl HX(delta,0),HX(Q1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+
+ vadd -,HX(P2,32),2 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vshl HX(delta,0),HX(Q0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(P1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+
+ vadd -,HX(Q0,32),4 CLRA SACC
+ vadd -,HX(P1,32),HX(P0,32) SACC
+ vmul -,HX(P2,32),3 SACC
+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(P2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+ #vmov HX(P2,0),3 IFN
+
+ # Now reverse all P/Qs
+
+ vadd -,HX(Q2,32),4 CLRA SACC
+ vshl -,HX(Q1,32),1 SACC
+ vshl -,HX(Q0,32),1 SACC
+ vshl -,HX(P0,32),1 SACC
+ vshl HX(delta,0),HX(P1,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+
+ vadd -,HX(Q2,32),2 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vshl HX(delta,0),HX(P0,32),0 SACC
+ vasr HX(delta,0),HX(delta,0), 2
+ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+
+ vadd -,HX(P0,32),4 CLRA SACC
+ vadd -,HX(Q1,32),HX(Q0,32) SACC
+ vmul -,HX(Q2,32),3 SACC
+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+ vasr HX(delta,0),HX(delta,0), 3
+ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+
+ ##############################################################################
+ # Normal filtering
+normal_filtering:
+ # Invert the decision flags
+ # use a more complicated instruction sequence here, as the assembler has a bug and loses SETF otherwise
+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+ vmov -, HX(tc10,0) SETF # IFN means normal filtering
+
+ vmov -,1 IFN SUMS r5
+ cmp r5,0
+ beq filtering_done
+
+ vasr HX(tc2,0), HX(tc,0), 1
+ vmul HX(tc10,0), HX(tc,0), 10
+
+ vasr HX(thresh,0), HX(beta,0), 1
+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+
+ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+ # Expand ptest and qtest together
+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+
+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+ vmov -,8 CLRA SACC
+ vmul -,HX(delta0,0), 9 SACC
+ vmul HX(delta0,0),HX(delta1,0), r6 SACC
+ vasr HX(delta0,0), HX(delta0,0), 4
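+ # i.e. delta0 = (9*(Q0-P0) - 3*(Q1-P1) + 8) >> 4, the standard HEVC weak-filter delta (r6 holds -3)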
+ vdist HX(deltatest,0), HX(delta0,0), 0
+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+
+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+
+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+ vadd HX(deltap1,0), HX(deltap1,0), 1
+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+ vasr HX(deltap1,0), HX(deltap1,0), 1
+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+
+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+ vadd HX(deltaq1,0), HX(deltaq1,0), 1
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+ vrsub -, HX(delta0,0), 0 SACC
+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+ vasr HX(deltaq1,0), HX(deltaq1,0), 1
+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+
+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+
+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+
+ vmov -,HX(deltatest,0) SETF
+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+
+ #vmov HX(P2,0),1 IFN
+
+filtering_done:
+ b lr
+
+
+hevc_uv_deblock_16x16:
+ push r6-r15, lr
+ mov r14,0
+ b hevc_uv_start
+hevc_uv_deblock_16x16_with_clear:
+ push r6-r15, lr
+ mov r14,1
+ b hevc_uv_start
+
+hevc_uv_start:
+ mov r9,r4
+ mov r4,r3
+ mov r13,r2
+ mov r2,r0
+ mov r10,r0
+ subscale4 r0,r1
+ mov r8,63
+ mov r6,-3
+ vmov H(zeros,0),0
+# r7 is number of blocks still to load
+# r0 is location of current block - 4 * stride
+# r1 is stride
+# r2 is location of current block
+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+# r4 is setup
+# r5 is for temporary calculations
+# r8 holds 63
+# r6 holds -3
+# r9 holds the number of 16 high rows to process
+# r10 holds the original img base
+# r11 returns 0 if no filtering was done on the edge
+# r12 saves a copy of this
+# r13 is copy of width
+# r14 is 1 if we should clear the old contents, or 0 if not
+
+uv_process_row:
+ # First iteration does not do horizontal filtering on previous
+ mov r7, r13
+ mov r3,0
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4) # We may wish to prefetch these
+ cmp r14,1
+ bne uv_skip0
+ vstb H(zeros,0),(r4)
+uv_skip0:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+ bl uv_vert_filter
+ sub r3,8
+ b uv_start_deblock_loop
+uv_deblock_loop:
+ # Middle iterations do vertical on current block and horizontal on preceding
+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
+ vldb H(16++,16)+r3,(r2 += r1) REP 16
+ vldb H(setup_input,0), (r4)
+ cmp r14,1
+ bne uv_skip1
+ vstb H(zeros,0),(r4)
+uv_skip1:
+ bl uv_vert_filter
+ add r3,8
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_vert_filter
+ sub r3,8
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip3
+ vstb H(zeros,0),-16(r4)
+uv_skip3:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,8*64
+ addcmpbeq r12,0,0,uv_skip_save_top
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+uv_start_deblock_loop:
+ # move onto next 16x16 (could do this with circular buffer support instead)
+ add r3,16
+ and r3,r8
+ add r4,32
+ # Perform loop counter operations (may work with an addcmpbgt as well?)
+ add r0,16
+ add r2,16
+ sub r7,1
+ cmp r7,0 # Are there still more blocks to load
+ bgt uv_deblock_loop
+
+ # Final iteration needs to just do horizontal filtering
+ vldb H(setup_input,0), -16(r4)
+ cmp r14,1
+ bne uv_skip2
+ vstb H(zeros,0),-16(r4)
+uv_skip2:
+ bl uv_horz_filter
+ mov r12,r11
+ add r3,8*64
+ vadd H(setup_input,0),H(setup_input,8),0
+ bl uv_horz_filter
+ sub r3,64*8
+ addcmpbeq r12,0,0,uv_skip_save_top2
+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
+uv_skip_save_top2:
+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+
+# Now look to see if we should do another row
+ sub r9,1
+ cmp r9,0
+ bgt uv_start_again
+ pop r6-r15, pc
+uv_start_again:
+ # Need to sort out r0,r2 to point to next row down
+ addscale16 r10,r1
+ mov r2,r10
+ subscale4 r0,r2,r1
+ b uv_process_row
+
+
+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+
+uv_vert_filter:
+ push lr
+
+ vmov HX(P1,0), V(16,14)+r3
+ vmov HX(P0,0), V(16,15)+r3
+ vmov HX(Q0,0), V(16,16)+r3
+ vmov HX(Q1,0), V(16,17)+r3
+
+ bl do_chroma_filter
+
+ vadds V(16,15)+r3, HX(P0,0), 0
+ vadds V(16,16)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# Filter edge at H(16,0)+r3
+uv_horz_filter:
+ push lr
+
+ vmov HX(P1,0), H(14,0)+r3
+ vmov HX(P0,0), H(15,0)+r3
+ vmov HX(Q0,0), H(16,0)+r3
+ vmov HX(Q1,0), H(17,0)+r3
+
+ bl do_chroma_filter
+
+ vadds H(15,0)+r3, HX(P0,0), 0
+ # P1 and Q1 never change so don't bother saving back
+ vadds H(16,0)+r3, HX(Q0,0), 0
+
+ pop pc
+
+# r4 points to array of beta/tc for each 4-pixel edge
+do_chroma_filter:
+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+ valtl HX(tc,0),H(setup,0),H(setup,0)
+
+ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+ vsub -,HX(P1,0),HX(Q1,0) SACC
+ vmov HX(delta,0),4 SACC
+ vasr HX(delta,0),HX(delta,0),3
+ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+ vadd HX(P0,0),HX(P0,0),HX(delta,0)
+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+ b lr
+
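+# Per pixel, the chroma filter above is equivalent to this illustrative C:
+#
+#   delta = clip((((Q0 - P0) * 4) + (P1 - Q1) + 4) >> 3, -tc, tc);
+#   P0 = P0 + delta;
+#   Q0 = Q0 - delta;
+#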
+# r0 = list
+# r1 = number
+hevc_run_command_list:
+ push r6-r7, lr
+ mov r6, r0
+ mov r7, r1
+loop_cmds:
+ ld r0,(r6) # How to encode r6++?
+ add r6,4
+ ld r1,(r6)
+ add r6,4
+ ld r2,(r6)
+ add r6,4
+ ld r3,(r6)
+ add r6,4
+ ld r4,(r6)
+ add r6,4
+ ld r5,(r6)
+ add r6,4
+ bl hevc_trans_16x16
+ sub r7,1
+ cmp r7,0
+ bgt loop_cmds
+
+ pop r6-r7, pc
diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
new file mode 100644
index 0000000..3904efc
--- /dev/null
+++ b/libavcodec/rpi_mailbox.c
@@ -0,0 +1,340 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+#include <linux/ioctl.h>
+
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+#include "rpi_mailbox.h"
+
+#define PAGE_SIZE (4*1024)
+
+// Shared memory will not be cached in ARM cache
+void *mapmem_shared(unsigned base, unsigned size)
+{
+ int mem_fd;
+ unsigned offset = base % PAGE_SIZE;
+ base = base - offset;
+ /* open /dev/mem */
+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+ return NULL;
+ }
+ void *mem = mmap(
+ 0,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED/*|MAP_FIXED*/,
+ mem_fd,
+ base);
+#ifdef DEBUG
+ printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+ if (mem == MAP_FAILED) {
+ printf("mmap error %d\n", (int)mem);
+ return NULL;
+ }
+ close(mem_fd);
+ return (char *)mem + offset;
+}
+
+// Unshared memory will be faster as it lives in the ARM cache, but requires cache flushing
+void *mapmem_private(unsigned base, unsigned size)
+{
+ int mem_fd;
+ unsigned offset = base % PAGE_SIZE;
+ base = base - offset;
+ /* open /dev/mem */
+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+ return NULL;
+ }
+ void *mem = mmap(
+ 0,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_PRIVATE/*|MAP_FIXED*/,
+ mem_fd,
+ base);
+#ifdef DEBUG
+ printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+ if (mem == MAP_FAILED) {
+ printf("mmap error %d\n", (int)mem);
+ return NULL;
+ }
+ close(mem_fd);
+ return (char *)mem + offset;
+}
+
+void unmapmem(void *addr, unsigned size)
+{
+ int s = munmap(addr, size);
+ if (s != 0) {
+ printf("munmap error %d\n", s);
+ exit (-1);
+ }
+}
+
+/*
+ * use ioctl to send mbox property message
+ */
+
+static int mbox_property(int file_desc, void *buf)
+{
+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+ if (ret_val < 0) {
+ printf("ioctl_set_msg failed:%d\n", ret_val);
+ }
+
+#ifdef DEBUG
+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+ for (i=0; i<size/4; i++)
+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+#endif
+ return ret_val;
+}
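+
+/*
+ * All of the helpers below build a firmware property message with this
+ * layout (the standard mailbox property interface):
+ *   p[0]  total buffer size in bytes
+ *   p[1]  0 = process request (overwritten with the response status)
+ *   then one tag: tag id, value buffer size, value data size, value words...
+ *   followed by an end tag of 0.
+ * The firmware writes its reply over the value words in place, which is why
+ * each helper returns p[5], the first value word of the response.
+ */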
+
+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000c; // (the tag id)
+ p[i++] = 12; // (size of the buffer)
+ p[i++] = 12; // (size of the data)
+ p[i++] = size; // (num bytes? or pages?)
+ p[i++] = align; // (alignment)
+ p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_free(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000f; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_lock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000d; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned mem_unlock(int file_desc, unsigned handle)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x3000e; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = handle;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
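+
+/*
+ * Sketch of the expected buffer lifecycle with these helpers (illustrative
+ * only; the align and flags values are placeholders, not prescribed here):
+ *
+ *   int mb = mbox_open();
+ *   unsigned h = mem_alloc(mb, size, align, flags); // firmware handle
+ *   unsigned bus = mem_lock(mb, h);  // pin the buffer, get its bus address
+ *   ... hand "bus" to the VPU/QPU ...
+ *   mem_unlock(mb, h);
+ *   mem_free(mb, h);
+ *   mbox_close(mb);
+ */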
+
+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+{
+ int i=0;
+ unsigned p[32];
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x30010; // (the tag id)
+ p[i++] = 28; // (size of the buffer)
+ p[i++] = 28; // (size of the data)
+ p[i++] = code;
+ p[i++] = r0;
+ p[i++] = r1;
+ p[i++] = r2;
+ p[i++] = r3;
+ p[i++] = r4;
+ p[i++] = r5;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned qpu_enable(int file_desc, unsigned enable)
+{
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+
+ p[i++] = 0x30012; // (the tag id)
+ p[i++] = 4; // (size of the buffer)
+ p[i++] = 4; // (size of the data)
+ p[i++] = enable;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+ p[i++] = 0x30011; // (the tag id)
+ p[i++] = 16; // (size of the buffer)
+ p[i++] = 16; // (size of the data)
+ p[i++] = num_qpus;
+ p[i++] = control;
+ p[i++] = noflush;
+ p[i++] = timeout; // ms
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return p[5];
+}
+
+void execute_multi(int file_desc,
+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+ int i=0;
+ unsigned p[32];
+
+ p[i++] = 0; // size
+ p[i++] = 0x00000000; // process request
+ p[i++] = 0x30018; // (the tag id)
+ p[i++] = 88; // (size of the buffer)
+ p[i++] = 88; // (size of the data)
+
+ p[i++] = num_qpus;
+ p[i++] = control;
+ p[i++] = noflush;
+ p[i++] = timeout; // ms
+
+ p[i++] = num_qpus_2;
+ p[i++] = control_2;
+ p[i++] = noflush_2;
+ p[i++] = timeout_2; // ms
+
+ p[i++] = code;
+ p[i++] = r0;
+ p[i++] = r1;
+ p[i++] = r2;
+ p[i++] = r3;
+ p[i++] = r4;
+ p[i++] = r5;
+
+ p[i++] = code_2;
+ p[i++] = r0_2;
+ p[i++] = r1_2;
+ p[i++] = r2_2;
+ p[i++] = r3_2;
+ p[i++] = r4_2;
+ p[i++] = r5_2;
+
+ p[i++] = 0x00000000; // end tag
+ p[0] = i*sizeof *p; // actual size
+
+ mbox_property(file_desc, p);
+ return;
+}
+
+int mbox_open() {
+ int file_desc;
+
+ // open a char device file used for communicating with kernel mbox driver
+ file_desc = open(DEVICE_FILE_NAME, 0);
+ if (file_desc < 0) {
+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+ }
+ return file_desc;
+}
+
+void mbox_close(int file_desc) {
+ close(file_desc);
+}
diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
new file mode 100644
index 0000000..5898102
--- /dev/null
+++ b/libavcodec/rpi_mailbox.h
@@ -0,0 +1,25 @@
+#ifndef RPI_MAILBOX_H
+#define RPI_MAILBOX_H
+
+extern int mbox_open(void);
+extern void mbox_close(int file_desc);
+
+extern unsigned get_version(int file_desc);
+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+extern unsigned mem_free(int file_desc, unsigned handle);
+extern unsigned mem_lock(int file_desc, unsigned handle);
+extern unsigned mem_unlock(int file_desc, unsigned handle);
+extern void *mapmem_shared(unsigned base, unsigned size);
+extern void *mapmem_private(unsigned base, unsigned size);
+extern void unmapmem(void *addr, unsigned size);
+
+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+extern void execute_multi(int file_desc,
+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+extern unsigned qpu_enable(int file_desc, unsigned enable);
+
+#endif
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000..a01c051
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,991 @@
+#ifdef RPI
+// Use vchiq service for submitting jobs
+#define GPUSERVICE
+
+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+//#define RPI_TIME_TOTAL_QPU
+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPU code
+//#define RPI_TIME_TOTAL_VPU
+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+#define RPI_TIME_TOTAL_POSTED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/avassert.h"
+
+#include "config.h"
+
+#include <pthread.h>
+#include <time.h>
+#include <unistd.h>    // usleep()
+#include <semaphore.h> // sem_t used by the gpuserv callbacks
+
+#include "rpi_mailbox.h"
+#include "rpi_qpu.h"
+#include "rpi_shader.h"
+#include "rpi_hevc_transform.h"
+
+#include "rpi_user_vcsm.h"
+#ifdef GPUSERVICE
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+#pragma GCC diagnostic pop
+#endif
+
+// QPU profile flags
+#define NO_FLUSH 1
+#define CLEAR_PROFILE 2
+#define OUTPUT_COUNTS 4
+
+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+
+
+// On Pi2 there is no way to access the VPU L2 cache
+// GPU_MEM_FLG should be 4 for uncached memory. (Or 0xC for the alias that allocates in the VPU L2 cache)
+// However, if using VCSM allocated buffers, we need to use 0xC at the moment because VCSM does not allocate uncached memory correctly
+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+#define GPU_MEM_FLG 0x4
+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache)
+#define GPU_MEM_MAP 0x0
+
+#define vcos_verify_ge0(x) ((x)>=0)
+
+/*static const unsigned code[] =
+{
+ #include "rpi_shader.hex"
+};*/
+
+// Size in 32bit words
+#define QPU_CODE_SIZE 2048
+#define VPU_CODE_SIZE 2048
+
+const short rpi_transMatrix2even[32][16] = { // Even rows first
+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
+// Odd rows
+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
+
+struct GPU
+{
+ unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
+ int open_count; // Number of allocated video buffers
+ int mb; // Mailbox handle
+ int vc; // Address in GPU memory
+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+};
+
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+static volatile struct GPU* gpu = NULL;
+static GPU_MEM_PTR_T gpu_mem_ptr;
+
+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+static unsigned int Microseconds(void) {
+ struct timespec ts;
+ unsigned int x;
+ static unsigned int base = 0;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+ if (base==0) base=x;
+ return x-base;
+}
+#endif
+
+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+
+// Connect to QPU, returns 0 on success.
+static int gpu_init(volatile struct GPU **gpu) {
+ int mb = mbox_open();
+ int vc;
+ volatile struct GPU* ptr;
+ if (mb < 0)
+ return -1;
+#ifndef RPI_ASYNC
+ if (qpu_enable(mb, 1)) return -2;
+#endif
+ vcsm_init();
+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+ memset((void*)ptr, 0, sizeof *ptr);
+ vc = gpu_mem_ptr.vc;
+
+ ptr->mb = mb;
+ ptr->vc = vc;
+
+ printf("GPU allocated at 0x%x\n",vc);
+
+ *gpu = ptr;
+
+ // Now copy over the QPU code into GPU memory
+ {
+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+ }
+ // And the VPU code
+ {
+ int num_bytes = sizeof(rpi_hevc_transform);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+ }
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
+#ifdef RPI_ASYNC
+ {
+ int err;
+ vpu_async_tail = 0;
+ vpu_async_head = 0;
+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+ //printf("Created thread\n");
+ if (err) {
+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+ return -4;
+ }
+
+ {
+ struct sched_param param = {0};
+ int policy = 0;
+
+ if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+
+ policy = SCHED_FIFO;
+ param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+
+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+
+ if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+ }
+ else
+ {
+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+ policy,
+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+ param.sched_priority);
+ }
+ }
+ }
+
+ }
+
+ }
+#endif
+
+ return 0;
+}
+
+// Returns 1 if the gpu is currently idle
+static int gpu_idle(void)
+{
+ int ret = pthread_mutex_trylock(&gpu_mutex);
+ if (ret==0) {
+ pthread_mutex_unlock(&gpu_mutex);
+ return 1;
+ }
+ return 0;
+}
+
+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+static void gpu_lock(void) {
+ pthread_mutex_lock(&gpu_mutex);
+
+ if (gpu==NULL) {
+ gpu_init(&gpu);
+ }
+}
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mem_lock(mb, p->vc_handle);
+ av_assert0(p->vc);
+ return 0;
+}
+
+// Allocate memory on GPU
+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+// Returns 0 on success.
+// This allocates memory that will not be cached in ARM's data cache.
+// Therefore safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_lock();
+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+ gpu->open_count++;
+ gpu_unlock();
+ return r;
+}
+
+int gpu_get_mailbox(void)
+{
+ av_assert0(gpu);
+ return gpu->mb;
+}
+
+// Call this to clean and invalidate a region of memory
+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ iocache.s[0].handle = p->vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int) p->arm;
+ iocache.s[0].size = p->numbytes;
+ vcsm_clean_invalid( &iocache );
+#else
+ void *tmp = vcsm_lock(p->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+#endif
+}
+
+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+{
+#ifdef RPI_FAST_CACHEFLUSH
+ struct vcsm_user_clean_invalid_s iocache = {};
+ iocache.s[0].handle = p0->vcsm_handle;
+ iocache.s[0].cmd = 3; // clean+invalidate
+ iocache.s[0].addr = (int) p0->arm;
+ iocache.s[0].size = p0->numbytes;
+ iocache.s[1].handle = p1->vcsm_handle;
+ iocache.s[1].cmd = 3; // clean+invalidate
+ iocache.s[1].addr = (int) p1->arm;
+ iocache.s[1].size = p1->numbytes;
+ iocache.s[2].handle = p2->vcsm_handle;
+ iocache.s[2].cmd = 3; // clean+invalidate
+ iocache.s[2].addr = (int) p2->arm;
+ iocache.s[2].size = p2->numbytes;
+ vcsm_clean_invalid( &iocache );
+#else
+ void *tmp;
+ tmp = vcsm_lock(p0->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+ tmp = vcsm_lock(p1->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+ tmp = vcsm_lock(p2->vcsm_handle);
+ vcsm_unlock_ptr(tmp);
+#endif
+}
+
+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+ p->numbytes = numbytes;
+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+ av_assert0(p->vcsm_handle);
+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+ av_assert0(p->vc_handle);
+ p->arm = vcsm_lock(p->vcsm_handle);
+ av_assert0(p->arm);
+ p->vc = mem_lock(gpu->mb, p->vc_handle);
+ av_assert0(p->vc);
+ return 0;
+}
+
+// This allocates data that will be
+// Cached in ARM L2
+// Uncached in VPU L2
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
+ gpu_lock();
+ r = gpu_malloc_cached_internal(numbytes, p);
+ gpu->open_count++;
+ gpu_unlock();
+ return r;
+}
+
+static void gpu_term(void)
+{
+ int mb;
+
+ if (gpu==NULL)
+ return;
+ mb = gpu->mb;
+
+ // ??? Tear down anything needed for gpuexecute
+
+ qpu_enable(mb, 0);
+ gpu_free_internal(&gpu_mem_ptr);
+
+ vcsm_exit();
+
+ mbox_close(mb);
+ gpu = NULL;
+}
+
+void gpu_free_internal(GPU_MEM_PTR_T *p) {
+ int mb = gpu->mb;
+ mem_unlock(mb,p->vc_handle);
+ vcsm_unlock_ptr(p->arm);
+ vcsm_free(p->vcsm_handle);
+}
+
+void gpu_free(GPU_MEM_PTR_T *p) {
+ gpu_lock();
+
+ gpu_free_internal(p);
+
+ gpu->open_count--;
+ if (gpu->open_count==0) {
+ printf("Closing GPU\n");
+ gpu_term();
+ gpu = NULL;
+ }
+ gpu_unlock();
+}
+
+unsigned int vpu_get_fn(void) {
+ // Make sure that the gpu is initialized
+ if (gpu==NULL) {
+ printf("Preparing gpu\n");
+ gpu_lock();
+ gpu_unlock();
+ }
+ return gpu->vc + offsetof(struct GPU,vpu_code);
+}
+
+unsigned int vpu_get_constants(void) {
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+ return gpu->vc + offsetof(struct GPU,transMatrix2even);
+}
+
+#ifdef GPUSERVICE
+static void callback(void *cookie)
+{
+ sem_post((sem_t *)cookie);
+}
+#endif
+
+
+static volatile uint32_t post_done = 0;
+static volatile uint32_t post_qed = 0;
+
+static void post_code2_cb(void * v)
+{
+ uint32_t n = (uint32_t)v;
+ if ((int32_t)(n - post_done) > 0) {
+ post_done = n;
+ }
+}
+
+
+// Post a command to the queue
+// Returns an id which we can use to wait for completion
+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+{
+ struct gpu_job_s j[1] = {
+ {
+ .command = EXECUTE_VPU,
+ .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+ .callback.func = post_code2_cb
+ }
+ };
+ uint32_t id;
+
+ j[0].callback.cookie = (void *)(id = ++post_qed);
+
+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+
+ return id;
+}
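+
+// Sketch of the intended post/wait usage (illustrative only; nothing in this
+// patch is changed by it):
+//
+//   int id = vpu_post_code2(vpu_get_fn(), r0, r1, r2, r3, r4, r5, NULL);
+//   ... overlap ARM-side work here ...
+//   vpu_wait(id); // returns once post_done has reached id
+//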
+
+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ int qpu0_n, const uint32_t * qpu0_mail,
+ int qpu1_n, const uint32_t * qpu1_mail)
+{
+#if 1
+ sem_t sync0;
+ struct gpu_job_s j[4];
+
+ sem_init(&sync0, 0, 0);
+
+ j[0].command = EXECUTE_VPU;
+ j[0].u.v.q[0] = vpu_code;
+ j[0].u.v.q[1] = r0;
+ j[0].u.v.q[2] = r1;
+ j[0].u.v.q[3] = r2;
+ j[0].u.v.q[4] = r3;
+ j[0].u.v.q[5] = r4;
+ j[0].u.v.q[6] = r5;
+ j[0].callback.func = 0;
+ j[0].callback.cookie = NULL;
+
+ j[1].command = EXECUTE_QPU;
+ j[1].u.q.jobs = qpu1_n;
+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+ j[1].u.q.timeout = 5000;
+ j[1].callback.func = 0;
+ j[1].callback.cookie = NULL;
+
+ j[2].command = EXECUTE_QPU;
+ j[2].u.q.jobs = qpu0_n;
+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[2].u.q.noflush = 1;
+ j[2].u.q.timeout = 5000;
+ j[2].callback.func = 0;
+ j[2].callback.cookie = NULL;
+
+ j[3].command = EXECUTE_SYNC;
+ j[3].u.s.mask = 3;
+ j[3].callback.func = callback;
+ j[3].callback.cookie = (void *)&sync0;
+
+ av_assert0(vc_gpuserv_execute_code(4, j) == 0);
+
+ sem_wait(&sync0);
+#else
+
+ sem_t sync0, sync2;
+ struct gpu_job_s j[3];
+
+ sem_init(&sync0, 0, 0);
+ sem_init(&sync2, 0, 0);
+
+ j[0].command = EXECUTE_VPU;
+ j[0].u.v.q[0] = vpu_code;
+ j[0].u.v.q[1] = r0;
+ j[0].u.v.q[2] = r1;
+ j[0].u.v.q[3] = r2;
+ j[0].u.v.q[4] = r3;
+ j[0].u.v.q[5] = r4;
+ j[0].u.v.q[6] = r5;
+ j[0].callback.func = callback;
+ j[0].callback.cookie = (void *)&sync0;
+
+ j[1].command = EXECUTE_QPU;
+ j[1].u.q.jobs = qpu1_n;
+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+ j[1].u.q.timeout = 5000;
+ j[1].callback.func = 0;
+ j[1].callback.cookie = NULL;
+
+ j[2].command = EXECUTE_QPU;
+ j[2].u.q.jobs = qpu0_n;
+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ j[2].u.q.noflush = 1;
+ j[2].u.q.timeout = 5000;
+ j[2].callback.func = callback;
+ j[2].callback.cookie = (void *)&sync2;
+
+ av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+
+ sem_wait(&sync0);
+ sem_wait(&sync2);
+#endif
+
+ return 0;
+}
+
+
+// Wait for completion of the given command
+void vpu_wait(int id)
+{
+ if (id == 0) {
+#if 0
+ sem_t sync0;
+ struct gpu_job_s j[1] =
+ {
+ {
+ .command = EXECUTE_SYNC,
+ .u.s.mask = 3,
+ .callback.func = callback,
+ .callback.cookie = (void *)&sync0
+ }
+ };
+
+ sem_init(&sync0, 0, 0);
+
+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+
+ sem_wait(&sync0);
+#endif
+ }
+ else {
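+ // Serial-number comparison: the subtraction is unsigned and the result is
+ // tested as int32_t, so the wait stays correct even if post_done wraps.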
|
|
+ while ((int32_t)(post_done - (uint32_t)id) < 0) {
|
|
+ usleep(1000);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+unsigned int qpu_get_fn(int num) {
|
|
+ // Make sure that the gpu is initialized
|
|
+ unsigned int *fn;
|
|
+ if (gpu==NULL) {
|
|
+ printf("Preparing gpu\n");
|
|
+ gpu_lock();
|
|
+ gpu_unlock();
|
|
+ }
|
|
+ switch(num) {
|
|
+ case QPU_MC_SETUP:
|
|
+ fn = mc_setup;
|
|
+ break;
|
|
+ case QPU_MC_FILTER:
|
|
+ fn = mc_filter;
|
|
+ break;
|
|
+ case QPU_MC_EXIT:
|
|
+ fn = mc_exit;
+ break;
+ case QPU_MC_INTERRUPT_EXIT12:
+ fn = mc_interrupt_exit12;
+ break;
+ case QPU_MC_FILTER_B:
+ fn = mc_filter_b;
+ break;
+ //case QPU_MC_FILTER_HONLY:
+ // fn = mc_filter_honly;
+ // break;
+ case QPU_MC_SETUP_UV:
+ fn = mc_setup_uv;
+ break;
+ case QPU_MC_FILTER_UV:
+ fn = mc_filter_uv;
+ break;
+ case QPU_MC_FILTER_UV_B0:
+ fn = mc_filter_uv_b0;
+ break;
+ case QPU_MC_FILTER_UV_B:
+ fn = mc_filter_uv_b;
+ break;
+ case QPU_MC_INTERRUPT_EXIT8:
+ fn = mc_interrupt_exit8;
+ break;
+ case QPU_MC_END:
+ fn = mc_end;
+ break;
+ default:
+ printf("Unknown function\n");
+ exit(-1);
+ }
+ return gpu->vc + 4*(int)(fn-rpi_shader);
+ //return code[num] + gpu->vc;
+}
+
+#if 0
+typedef unsigned int uint32_t;
+
+typedef struct mvs_s {
+ GPU_MEM_PTR_T unif_mvs_ptr;
+ uint32_t *unif_mvs; // Base of memory for motion vector commands
+
+ // _base pointers are to the start of the row
+ uint32_t *mvs_base[8];
+ // these pointers are to the next free space
+ uint32_t *u_mvs[8];
+
+} HEVCContext;
+
+#define RPI_CHROMA_COMMAND_WORDS 12
+
+static void rpi_inter_clear(HEVCContext *s)
+{
+ int i;
+ for(i=0;i<8;i++) {
+ s->u_mvs[i] = s->mvs_base[i];
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 0;
+ *s->u_mvs[i]++ = 128; // w
+ *s->u_mvs[i]++ = 128; // h
+ *s->u_mvs[i]++ = 128; // stride u
+ *s->u_mvs[i]++ = 128; // stride v
+ s->u_mvs[i] += 3; // Padding words
+ }
+}
+
+static void rpi_execute_inter_qpu(HEVCContext *s)
+{
+ int k;
+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+
+ for(k=0;k<8;k++) {
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V
+ }
+
+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+
+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+ );
+}
+
+void rpi_test_qpu(void)
+{
+ HEVCContext mvs;
+ HEVCContext *s = &mvs;
+ int i;
+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+ uint32_t *p;
+ printf("Allocate memory\n");
+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+
+ // Set up initial locations for uniform streams
+ p = s->unif_mvs;
+ for(i = 0; i < 8; i++) {
+ s->mvs_base[i] = p;
+ p += uv_commands_per_qpu;
+ }
+ // Now run a simple program that should just quit immediately after a single texture fetch
+ rpi_inter_clear(s);
+ for(i=0;i<4;i++) {
+ printf("Launch QPUs\n");
+ rpi_execute_inter_qpu(s);
+ printf("Done\n");
+ }
+ printf("Free memory\n");
+ gpu_free(&s->unif_mvs_ptr);
+ return;
+}
+#endif
+
+#if 0
+
+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
+
+static uint8_t av_clip_uint8(int32_t a)
+{
+ if (a&(~255)) return (-a)>>31;
+ else return a;
+}
+
+static int32_t filter8(const uint8_t *data, int pitch)
+{
+ int32_t vsum = 0;
+ int x, y;
+
+ for (y = 0; y < 8; y++) {
+ int32_t hsum = 0;
+
+ for (x = 0; x < 8; x++)
+ hsum += hcoeffs[x]*data[x + y * pitch];
+
+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+ }
+
+ return av_clip_uint8( (vsum + 64) >> 7);
+}
+
+// Note regression changes coefficients so is not thread safe
+//#define REGRESSION
+#ifdef REGRESSION
+#define CMAX 100
+#else
+#define CMAX 2
+#endif
+#define YMAX 16
+
+int rpi_test_shader(void)
+{
+ int i, c;
+
+ uint32_t *unifs;
+
+ uint8_t *in_buffer;
+ uint8_t *out_buffer[2];
+
+ GPU_MEM_PTR_T unifs_ptr;
+ GPU_MEM_PTR_T in_buffer_ptr;
+ GPU_MEM_PTR_T out_buffer_ptr[2];
+
+ // Addresses in GPU memory of filter programs
+ uint32_t mc_setup = 0;
+ uint32_t mc_filter = 0;
+ uint32_t mc_exit = 0;
+
+ int pitch = 0x500;
+
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+
+ printf("This needs to change to reflect new assembler\n");
+ // Use table to compute locations of program start points
+ mc_setup = code[0] + gpu->vc;
+ mc_filter = code[1] + gpu->vc;
+ mc_exit = code[2] + gpu->vc;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+ return -2;
+ }
+ unifs = (uint32_t*)unifs_ptr.arm;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+ return -3;
+ }
+ in_buffer = (uint8_t*)in_buffer_ptr.arm;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+ return -4;
+ }
+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+
+ for (c = 0; c < CMAX; c++) {
+ int xo[] = {rand()&31, rand()&31};
+
+#ifdef REGRESSION
+ for (i = 0; i < 8; i++) {
+ hcoeffs[i] = (int8_t)rand();
+ vcoeffs[i] = (int8_t)rand();
+ if (hcoeffs[i]==-128)
+ hcoeffs[i]++;
+ if (vcoeffs[i]==-128)
+ vcoeffs[i]++;
+ }
+#endif
+
+ for (i = 0; i < 64*23; i++) {
+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+ in_buffer[i] = rand();
+ }
+
+ // Clear output array
+ {
+ int b;
+ for(b=0;b<2;b++) {
+ for(i=0;i<16*16;i++) {
+ out_buffer[b][i] = 3;
+ }
+ }
+ }
+
+ unifs[0] = mc_filter;
+ unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+ unifs[2] = 64; // src pitch
+ unifs[3] = pitch; // dst pitch
+ unifs[4] = 0; // Padding
+ unifs[5] = 0;
+ unifs[6] = 0;
+ unifs[7 ] = mc_filter;
+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[13] = out_buffer_ptr[0].vc;
+ unifs[14] = mc_exit;
+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy
+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[20] = out_buffer_ptr[1].vc;
+
+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+
+ // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+
+ //qpu_run_shader(mc_setup, unifs_ptr.vc);
+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+
+ if (1)
+ {
+ int x, y, b;
+ int bad = 0;
+
+ for (b=0; b<2; ++b)
+ for (y=0; y<YMAX; ++y)
+ for (x=0; x<16; ++x) {
+ int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+
+ if (out_buffer[b][x+y*pitch] != ref) {
+ bad = 1;
+// printf("%d, %d, %d, %d\n", c, b, x, y);
+ }
+#ifndef REGRESSION
+ //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+#endif
+ }
+ if (bad)
+ printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+ else
+ printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+ }
+ //printf("%d\n", simpenrose_get_qpu_tick_count());
+ }
+
+ gpu_free(&out_buffer_ptr[0]);
+ gpu_free(&out_buffer_ptr[1]);
+ gpu_free(&in_buffer_ptr);
+ gpu_free(&unifs_ptr);
+
+ return 0;
+}
+
+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+{
+ int x,y;
+ for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+ }
+ }
+}
+
+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+{
+ uint32_t *unifs;
+
+ GPU_MEM_PTR_T unifs_ptr;
+ //uint8_t *out_buffer;
+ //GPU_MEM_PTR_T out_buffer_ptr;
+
+ // Addresses in GPU memory of filter programs
+ uint32_t mc_setup = 0;
+ uint32_t mc_filter = 0;
+ uint32_t mc_exit = 0;
+ //int x,y;
+
+ if (gpu==NULL) {
+ gpu_lock();
+ gpu_unlock();
+ }
+
+ // Use table to compute locations of program start points
+ mc_setup = code[0] + gpu->vc;
+ mc_filter = code[1] + gpu->vc;
+ mc_exit = code[2] + gpu->vc;
+
+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+ return;
+ }
+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+ //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+
+ /*for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ out_buffer[x+y*dst_pitch] = 7;
+ }
+ }*/
+
+ unifs = (uint32_t*)unifs_ptr.arm;
+
+ unifs[0] = mc_filter;
+ unifs[1] = (int)in_buffer_vc;
+ unifs[2] = src_pitch; // src pitch
+ unifs[3] = dst_pitch; // dst pitch
+ unifs[4] = 0; // Padding
+ unifs[5] = 0;
+ unifs[6] = 0;
+ unifs[7 ] = mc_exit;
+ unifs[8 ] = (int)in_buffer_vc;
+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+ unifs[13] = (int)dst_vc;
+ //unifs[13] = (int)out_buffer_ptr.vc;
+
+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+
+ qpu_run_shader(mc_setup, unifs_ptr.vc);
+
+ /*for (y=0; y<16; ++y) {
+ for (x=0; x<16; ++x) {
+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+ }
+ }*/
+
+ gpu_free(&unifs_ptr);
+ //gpu_free(&out_buffer_ptr);
+}
+
+
+
+#endif
+
+#endif // RPI
diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
new file mode 100644
index 0000000..c6cdb2b
--- /dev/null
+++ b/libavcodec/rpi_qpu.h
@@ -0,0 +1,176 @@
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+#define RPI_FAST_CACHEFLUSH
+
+#define RPI_ONE_BUF 1
+
+typedef struct gpu_mem_ptr_s {
+ unsigned char *arm; // Pointer to memory mapped on ARM side
+ int vc_handle; // Videocore handle of relocatable memory
+ int vcsm_handle; // Handle for use by VCSM
+ int vc; // Address for use in GPU code
+ int numbytes; // Size of memory block
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T *p);
+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+
+#include "libavutil/frame.h"
+#if !RPI_ONE_BUF
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+ return p->vc;
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+}
+
+#else
+
+static inline int gpu_is_buf1(const AVFrame * const frame)
+{
+ return frame->buf[1] == NULL;
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+{
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+{
+ return av_buffer_pool_opaque(frame->buf[n]);
+}
+
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ?
+ gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+ gpu_buf3_gmem(frame, 1)->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ return gpu_is_buf1(frame) ?
+ gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+ gpu_buf3_gmem(frame, 2)->vc;
+}
+
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.numbytes = frame->data[1] - frame->data[0];
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 0);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[1] - frame->data[0];
+ g.vc += frame->data[1] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 1);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[2] - frame->data[0];
+ g.vc += frame->data[2] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
+
+#endif
+
+
+// QPU specific functions
+extern void rpi_test_qpu(void);
+
+enum {
+ QPU_MC_SETUP,
+ QPU_MC_FILTER,
+ QPU_MC_EXIT,
+ QPU_MC_INTERRUPT_EXIT12,
+ QPU_MC_FILTER_B,
+ QPU_MC_FILTER_HONLY,
+ QPU_MC_SETUP_UV,
+ QPU_MC_FILTER_UV,
+ QPU_MC_FILTER_UV_B0,
+ QPU_MC_FILTER_UV_B,
+ QPU_MC_INTERRUPT_EXIT8,
+ QPU_MC_END
+ };
+extern unsigned int qpu_get_fn(int num);
+
+#define QPU_N_UV 8
+#define QPU_N_Y 12
+#define QPU_N_MAX 16
+
+#define QPU_MAIL_EL_VALS 2
+#define QPU_MAIL_EL_SIZE (QPU_MAIL_EL_VALS * sizeof(uint32_t))
+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
+
+// VPU specific functions
+extern unsigned int vpu_get_fn(void);
+extern unsigned int vpu_get_constants(void);
+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+ int qpu0_n, const uint32_t * qpu0_mail,
+ int qpu1_n, const uint32_t * qpu1_mail);
+
+extern void vpu_wait( int id);
+
+// Simple test of shader code
+extern int rpi_test_shader(void);
+
+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+
+extern int gpu_get_mailbox(void);
+
+#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
index 0000000..06fb166
--- /dev/null
+++ b/libavcodec/rpi_shader.c
@@ -0,0 +1,629 @@
+#include "rpi_shader.h"
+
+#ifdef _MSC_VER
+ #include <stdint.h>
+ /* cast through uintptr_t to avoid warnings */
+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+#else
+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
+#endif
+
+#ifdef __cplusplus
+extern "C" { /* the types are probably wrong... */
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+__declspec(align(8))
+#elif defined(__GNUC__)
+__attribute__((aligned(8)))
+#endif
+unsigned int rpi_shader[] = {
+// ::mc_setup_uv
+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+// ::mc_filter_uv
+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28
+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif
+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+// :uvloop
+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_filter_uv_b0
+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21
+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0, r0, i_shift16 ; mov ra3, unif
+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif
+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0
+// :uvloop_b0
+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_filter_uv_b
+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif
+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8
+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif
+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+// :uvloop_b
+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4
+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14
+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_exit
+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop
+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_interrupt_exit8
+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_setup
+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+// :per_block_setup
+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num
+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b
+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b
+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif
+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif
+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13
+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a
+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b
+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c
+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d
+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+// ::mc_filter
+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+// :yloop
+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
|
|
+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
|
|
+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
|
|
+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
|
|
+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
|
|
+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
|
|
+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
|
|
+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
|
|
+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
|
|
+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
|
|
+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
|
|
+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
|
|
+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
|
|
+// ::mc_filter_b
|
|
+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
|
|
+// :yloopb
|
|
+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
|
|
+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
|
|
+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
|
|
+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
|
|
+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
|
|
+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
|
|
+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
|
|
+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
|
|
+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
|
|
+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
|
|
+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
|
|
+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
|
|
+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
|
|
+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
|
+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
|
|
+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
|
|
+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
|
|
+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
|
|
+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
|
|
+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
|
|
+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
|
|
+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
|
|
+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
|
|
+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
|
|
+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
|
|
+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
|
|
+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
|
|
+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
|
|
+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
|
|
+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
|
|
+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
|
|
+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
|
|
+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
|
|
+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
|
|
+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
|
|
+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
|
|
+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
|
|
+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
|
|
+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
|
|
+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
|
|
+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
|
|
+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
|
|
+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
|
|
+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
|
|
+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
|
|
+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12
|
|
+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
|
|
+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
|
|
+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14
|
|
+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
|
|
+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
|
|
+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
|
|
+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
|
|
+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
|
|
+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
|
|
+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
|
|
+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
|
|
+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
|
|
+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
|
|
+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
|
|
+// ::mc_interrupt_exit12
|
|
+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
|
|
+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
|
|
+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
|
|
+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
|
|
+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
|
|
+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
|
|
+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
|
|
+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
|
|
+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop
|
|
+// ::mc_exit1
|
|
+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
|
|
+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
|
|
+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
|
|
+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
|
|
+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
|
|
+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
|
|
+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
|
|
+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop
|
|
+// ::mc_end
|
|
+};
|
|
+#ifdef __HIGHC__
|
|
+#pragma Align_to(8, rpi_shader)
|
|
+#endif
|
|
diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
new file mode 100644
index 0000000..9772796
--- /dev/null
+++ b/libavcodec/rpi_shader.h
@@ -0,0 +1,19 @@
+#ifndef rpi_shader_H
+#define rpi_shader_H
+
+extern unsigned int rpi_shader[];
+
+#define mc_setup_uv (rpi_shader + 0)
+#define mc_filter_uv (rpi_shader + 132)
+#define mc_filter_uv_b0 (rpi_shader + 274)
+#define mc_filter_uv_b (rpi_shader + 392)
+#define mc_exit (rpi_shader + 540)
+#define mc_interrupt_exit8 (rpi_shader + 558)
+#define mc_setup (rpi_shader + 588)
+#define mc_filter (rpi_shader + 872)
+#define mc_filter_b (rpi_shader + 992)
+#define mc_interrupt_exit12 (rpi_shader + 1114)
+#define mc_exit1 (rpi_shader + 1152)
+#define mc_end (rpi_shader + 1168)
+
+#endif
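[Editor's note: a minimal sketch of how these entry-point macros are meant to be read; it is illustrative only and not part of the patch. Each QPU instruction is a 64-bit pair, so every entry of rpi_shader[] is half an instruction; the dummy array below stands in for the real one generated in rpi_shader.c.]

#include <stdio.h>

/* Stand-in for the real code array from rpi_shader.c. */
static unsigned int rpi_shader[1168 + 2];

#define mc_setup  (rpi_shader + 588)
#define mc_filter (rpi_shader + 872)
#define mc_end    (rpi_shader + 1168)

int main(void)
{
    /* Two 32-bit words per instruction: word offset / 2 gives the
     * instruction index, * sizeof(unsigned int) gives the byte offset. */
    printf("mc_filter: instruction %u, byte offset %u\n",
           (unsigned)((mc_filter - rpi_shader) / 2),
           (unsigned)((mc_filter - rpi_shader) * sizeof(unsigned int)));
    printf("code size up to mc_end: %u bytes\n",
           (unsigned)((mc_end - rpi_shader) * sizeof(unsigned int)));
    return 0;
}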
diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
index 0000000..aa9e1e7
--- /dev/null
+++ b/libavcodec/rpi_shader.qasm
@@ -0,0 +1,1098 @@
+# register allocation
+#
+# ra0...ra7 eight horizontal filter coefficients
+#
+# rb0 rx_shift2
+# rb1 rb_y2_next
+#
+# rb4...rb7
+#
+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent)
+#
+# (ra15 isn't clamped to zero - this happens during the
+# copy to ra14, and during its use in the vertical filter)
+#
+# rb8...rb11 eight vertical filter coefficients
+
+# ra4 y: Filter, UV: 0x10000
+
+# rb12 offset to add before shift (round + weighting offsets)
+# rb13 shift: denom + 6 + 9
+# rb14 L0 weight (U on left, V on right)
+# rb15 -- free --
+#
+# ra16 clipped(row start address+elem_num)&~3
+# ra17 per-channel shifts
+# ra18 L1 weight (Y)
+# ra19 next ra17
+#
+# rb16 pitch
+# rb17 height + 1
+# rb18 height + 3
+# rb19 next ra16
+#
+# ra20 1
+# ra21 ra_21
+# ra22 ra_k256 256
+# ra23 ra_y2_next ra_y2_next
+#
+# rb20 0xffffff00
+# rb21 vpm_setup for reading/writing 16bit results into VPM
+# rb22 rb_k255 255
+# rb23 24
+#
+# rb24 vdw_setup_1(dst_pitch)
+# rb25 frame width-1
+# rb26 height<<23 + width<<16 + vdw_setup_0
+# rb27 vdw_setup_0 (depends on QPU number)
+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM
+# rb29 vdw_setup_1(dst_pitch-width)
+# rb30 frame height-1
+# rb31 used as temp to count loop iterations
+#
+# ra24 clipped(row start address+8+elem_num)&~3
+# ra25 per-channel shifts 2
+# ra26 next ra24
+# ra27 next ra25
+# ra28 next y
+# ra29 y for next texture access
+# ra30 64
+#
+# ra31 next kernel address
+
+.set rb_frame_width_minus_1, rb25
+.set rb_frame_height_minus_1, rb30
+.set rb_pitch, rb16
+.set ra_x, ra16
+.set ra_y2, ra21.16a
+.set ra_y2_next, ra21.16b
+
+.set rb_x_next, rb19
+.set rx_frame_base2_next, rb19
+
+.set ra_frame_base, ra24
+.set ra_frame_base_next, ra26
+.set ra_xshift, ra17
+
+.set ra_u2v_ref_offset, ra25
+.set ra_frame_base2, ra25
+
+.set ra_xshift_next, ra19
+.set rx_xshift2, rb0
+.set rx_xshift2_next, rb1
+
+.set ra_u2v_dst_offset, ra27
+
+.set ra_y_next, ra28
+.set ra_y, ra29
+
+.set ra_k1, ra20
+.set rb_k255, rb22
+.set ra_k256, ra22
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
+
+################################################################################
+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+::mc_setup_uv
+
+# Read starting kernel
+mov ra31, unif
+
+# Load first request location
+add ra_x, unif, elem_num # Store x
+mov ra_y, unif # Store y
+mov ra_frame_base, unif # Store frame u base
+nop
+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+
+# Read image dimensions
+sub rb25,unif,1
+sub rb30,unif,1
+
+# get source pitch
+mov rb16, unif
+
+# get destination pitch
+mov r0, unif
+mov r1, vdw_setup_1(0)
+add rb24, r1, r0
+
+# load constants
+
+mov ra4, 0x10000
+mov ra_k1, 1
+mov ra_k256, 256
+mov ra30, 64
+
+mov rb20, 0xffffff00
+mov rb_k255, 255
+mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
+mov ra8, 0
+mov ra9, 0
+mov ra10, 0
+mov ra11, 0
+mov ra12, 0
+mov ra13, 0
+mov ra14, 0
+mov ra15, 0
+
+# Compute base address for first and second access
+mov r0, ra_x # Load x
+max r0, r0, 0; mov r1, ra_y # Load y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base
+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+add ra_y, r1, 1
+add r0, r0, r3
+and r0, r0, ~3
+max r1, r1, 0 ; mov ra_x, r0 # y
+min r1, r1, rb_frame_height_minus_1
+# submit texture requests for first line
+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+add t0s, r0, r1 ; mov ra_frame_base, r2
+add t1s, r2, r1
+
+mov r2, 9
+add rb13, r2, unif # denominator
+mov -, unif # Unused
+
+# Compute part of VPM to use for DMA output
+mov r2, unif
+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+and r2, r2, 15
+mov r1, r2
+asr r1, r1, 2
+shl r1, r1, 6
+mov r0, r2
+and r0, r0, 3
+add r0, r0, r1
+
+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+add rb28, r0, r1 # VPM 8bit storage
+asr r2, r0, 1 # r0 = bc0000d
+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+add rb21, r2, r1 # VPM for 16bit intermediates
+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+shl r0, r0, 5
+add rb27, r0, r1 # DMA out
+
+# submit texture requests for second line
+max r1, ra_y, 0
+min r1, r1, rb_frame_height_minus_1
+add ra_y, ra_y, 1
+bra -, ra31
+nop ; mul24 r1, r1, rb_pitch
+add t0s, r1, ra_x
+add t1s, r1, ra_frame_base
+
+
+
+################################################################################
+
+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov r1, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+mov ra_y_next, r1 ; mov vw_setup, rb28
+add ra_frame_base_next, rb_x_next, r2
+
+# set up VPM write
+# get width,height of block
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# unpack filter coefficients
+
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight
+nop ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop count
+
+shl r1, ra1.16b, rb13
+asr rb12, r1, 1
+shl rb14, ra1.16a, 1 # b14 = weight*2
+
+# rb14 - weight L0 * 2
+# rb13 = weight denom + 6 + 9
+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+
+# r2 is elem_num
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop
+mov ra13, ra14 ; mul24 r1, ra14, rb9
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+sub r1, r1, r0 ; mov -, vw_wait
+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+asr r1, r1, 14
+nop ; mul24 r1, r1, rb14
+shl r1, r1, 8
+
+add r1, r1, rb12
+brr.anyn -, r:uvloop
+asr r1, r1, rb13
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
+
+# DMA out for U
+
+mov vw_setup, rb26 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+# DMA out for V
+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+# Could potentially push this write into the start of the next pipeline stage.
+mov r0, 16
+mov -, vw_wait
+
+bra -, ra31
+add vw_setup, rb26, r0 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+
+################################################################################
+
+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv_b0
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov r1, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs
+mov ra_y_next, r1 ; mov vw_setup, rb21
+
+add ra_frame_base_next, rb_x_next, r2
+
+# Need to have unsigned coeffs so we can just unpack in the filter
+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+# filter code. Unpack into b regs for V
+
+# set up VPM write, we need to save 16bit precision
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs
+add rb26, r0, rb27
+
+mov rb8, ra3.8a
+mov rb9, ra3.8b
+mov rb10, ra3.8c
+mov rb11, ra3.8d
+
+# r2 is elem_num
+# r3 is loop counter
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+mov rb14, unif # U weight L0
+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter
+# rb14 unused in b0 but will hang around till the second pass
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop_b0
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b0
+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b0
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+sub.setf -, r3, rb18
+brr.anyn -, r:uvloop_b0
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+sub r1, r1, r0 ; mov -, vw_wait
+asr vpm, r1, 6
+# >>> .anyn uvloop_b0
+
+# in pass0 we don't really need to save any results, but need to discard the uniforms
+# DMA out for U
+
+bra -, ra31
+mov -, unif # Delay 1
+mov -, unif # Delay 2
+nop # Delay 3
+
+
+################################################################################
+
+::mc_filter_uv_b
+mov ra31, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# set up VPM write
+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+
+# get base addresses and per-channel shifts for *next* invocation
+add r0, unif, elem_num # x
+max r0, r0, 0 ; mov ra_y_next, unif # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base
+add r0, r0, r3 ; mov ra1, unif # width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
+
+add ra_frame_base_next, rb_x_next, r2
+
+# r0 is currently height<<7
+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+shr r3, r3, 8
+add vr_setup, r3, rb21
+
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27
+
+# get filter coefficients
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# Get offset & weight stuff
+
+# The unif read occurs unconditionally, only the write is conditional
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ;
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ;
+nop ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ;
+
+shl r1, ra1.16b, rb13
+asr rb12, r1, 1
+
+# ra1.16a used directly in the loop
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# r3 = 0
+:uvloop_b
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+nop ; mul24 r2, ra0.8b << 1, r0 << 1
+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b
+mov ra13, ra14 ; mul24 r1, ra14, rb9
+mov ra14, ra15
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b
+
+# apply vertical filter and write to VPM
+
+sub r1, r1, r0 ; mul24 r0, ra14, rb10
+add r1, r1, r0 ; mul24 r0, ra15, rb11
+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000
+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+asr r1, r1, 14 # shift2=6
+
+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
+nop ; mul24 r0, r0, rb14
+
+add r1, r1, r0 ; mov -, vw_wait
+shl r1, r1, 8 # Lose bad top 8 bits & sign extend
+
+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+
+brr.anyn -, r:uvloop_b
+asr r1, r1, rb13 # Delay 1
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
+
+
+# DMA out for U
+
+mov vw_setup, rb26 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+# DMA out for V
+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+# Could potentially push this write into the start of the next pipeline stage.
+mov r0, 16
+mov -, vw_wait
+
+bra -, ra31
+add vw_setup, rb26, r0 # VDW setup 0
+mov vw_setup, rb29 # Stride
+mov vw_addr, unif # start the VDW
+
+################################################################################
+
+# mc_exit()
+
+::mc_exit
+mov -, vw_wait # wait on the VDW
+
+mov -,srel(0)
+
+ldtmu0
+ldtmu1
+ldtmu0
+ldtmu1
+
+nop ; nop ; thrend
+nop ; nop # delay slot 1
+nop ; nop # delay slot 2
+
+# mc_interrupt_exit8()
+::mc_interrupt_exit8
+mov -, vw_wait # wait on the VDW
+
+ldtmu0
+ldtmu1
+ldtmu0
+ldtmu1
+
+mov -,sacq(0) # 1
+mov -,sacq(0) # 2
+mov -,sacq(0) # 3
+mov -,sacq(0) # 4
+mov -,sacq(0) # 5
+mov -,sacq(0) # 6
+mov -,sacq(0) # 7
+
+nop ; nop ; thrend
+mov interrupt, 1; nop # delay slot 1
+nop ; nop # delay slot 2
+
+
+
+
+
+
+# LUMA CODE
+
+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8
+
+################################################################################
+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+::mc_setup
+ mov r3, 16
+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
+ mov ra8, unif # y_x
+ mov ra9, unif # ref_y_base
+ mov ra10, unif # y2_x2
+ mov ra11, unif # ref_y2_base
+
+# Read image dimensions
+ mov r1, unif # width_height
+ shl r0,r1,r3
+ asr r1,r1,r3 # width
+ asr r0,r0,r3 # height
+ sub rb_frame_width_minus_1,r1,1
+ sub rb_frame_height_minus_1,r0,1
+
+# get source pitch
+ mov rb_pitch, unif # src_pitch
+
+# get destination pitch
+ mov r0, unif # dst_pitch
+ mov r1, vdw_setup_1(0)
+ add rb24, r1, r0
+
+# Compute base address for first and second access
+ mov r1, ra8 # y_x
+ shl r0,r1,r3 # r0 is x<<16
+ asr r1,r1,r3 # r1 is y
+ asr r0,r0,r3 # r0 is x
+ add r0, r0, elem_num # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ add ra_y, r1, 1
+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
+ add r2, r2, r0 # r2 is address for frame0 (not including y offset)
+ max r1, r1, 0
+ min r1, r1, rb_frame_height_minus_1
+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0
+ add t0s, r2, r1 ; mov ra_frame_base, r2
+
+ mov r1, ra10 # y2_x2
+ shl r0,r1,r3 # r0 is x<<16
+ asr r1,r1,r3 # r1 is y
+ asr r0,r0,r3 # r0 is x
+ add r0, r0, elem_num # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
+ add ra_y2, r1, 1
+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
+ add r2, r2, r0 # r2 is address for frame1 (not including y offset)
+ max r1, r1, 0
+ min r1, r1, rb_frame_height_minus_1
+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame1
+ add t1s, r2, r1 ; mov ra_frame_base2, r2
+
+
+# load constants
+
+ mov ra_k1, 1
+ mov ra_k256, 256
+ mov ra30, 64
+
+ mov rb20, 0xffffff00
+ mov rb_k255, 255
+ mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
+ mov ra8, 0
+ mov ra9, 0
+ mov ra10, 0
+ mov ra11, 0
+ mov ra12, 0
+ mov ra13, 0
+ mov ra14, 0
+ mov ra15, 0
+
+# Compute part of VPM to use
+ mov r2, qpu_num
+ mov r1, r2
+ asr r1, r1, 2
+ shl r1, r1, 6
+ mov r0, r2
+ and r0, r0, 3
+ add r0, r0, r1
+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+ add rb28, r0, r1 # VPM for saving data
+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+ shl r0, r0, 5
+ add rb27, r0, r1 # Command for dma output
+
+# Weighted prediction denom
+ add rb13, unif, 9 # unif = weight denom + 6
+
+ mov -, unif # Unused
+
+# submit texture requests for second line
+ max r1, ra_y, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y, ra_y, 1
+ nop ; mul24 r1, r1, rb_pitch
+ add t0s, r1, ra_frame_base
+
+ max r1, ra_y2, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1
+ nop ; mul24 r1, r1, rb_pitch
+ add t1s, r1, ra_frame_base2
+
+# FALL THROUGH TO PER-BLOCK SETUP
+
+# Start of per-block setup code
+# P and B blocks share the same setup code to save on Icache space
+:per_block_setup
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ mov ra31, unif
+
+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack??
+
+# per-channel shifts were calculated on the *previous* invocation
+ mov ra_xshift, ra_xshift_next
+ mov rx_xshift2, rx_xshift2_next
+
+# get base addresses and per-channel shifts for *next* invocation
+
+ add r0, ra1.16a, r1 # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ mov r3, 8 ; mov ra_y_next, ra1.16b
+ and r0, r0, ~3 ; mov ra1, unif # y2_x2
+ add ra_frame_base_next, r2, r0
+
+ add r0, ra1.16a, r1 # Load x
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ;
+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate
+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset)
+
+# set up VPM write
+ mov vw_setup, rb28
+
+# get width,height of block (unif load above)
+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+ add rb17, ra1.16a, 5
+ add rb18, ra1.16a, 7
+ shl r0, ra1.16a, 7
+ add r0, r0, ra1.16b # Combine width and height of destination area
+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
+ shl.ifz r0, r0, i_shift16 # Pick half to use
+ shl ra8, r0, 3
+
+# Pack the 1st 4 filter coefs for H & V tightly
+
+ mov r1,0x00010100 # -ve
+ ror ra2.8a, r1, ra8.8d
+ ror ra0.8a, r1, ra8.8c
+
+ mov r1,0x01040400
+ ror ra2.8b, r1, ra8.8d
+ ror ra0.8b, r1, ra8.8c
+
+ mov r1,0x050b0a00 # -ve
+ ror ra2.8c, r1, ra8.8d
+ ror ra0.8c, r1, ra8.8c
+
+ mov r1,0x11283a40
+ ror ra2.8d, r1, ra8.8d
+ ror ra0.8d, r1, ra8.8c
+
+# In the 2nd vertical half we use b registers due to
+# using a-side fifo regs. The easiest way to achieve this is to pack it
+# and then unpack!
+
+ mov r1,0x3a281100
+ ror ra3.8a, r1, ra8.8d
+ ror ra1.8a, r1, ra8.8c
+
+ mov r1,0x0a0b0500 # -ve
+ ror ra3.8b, r1, ra8.8d
+ ror ra1.8b, r1, ra8.8c
+
+ mov r1,0x04040100
+ ror ra3.8c, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c
+
+# Extract weighted prediction information in parallel
+
+ mov r1,0x01010000 # -ve
+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 (hi16)/weight L0 (lo16)
+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9
+
+# r3 = 16 from (long way) above
+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offset = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+ asr ra18, r0, r3 ; mov rb5, ra3.8b
+ bra -, ra31
+ shl r0, r0, r3 ; mov rb6, ra3.8c
+ mov r3, 0 ; mov rb7, ra3.8d # loop count ;
+ asr rb12, r1, 9
+
+# >>> branch ra31
+#
+# r3 = 0
+# ra18 = weight L1
+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble)
+# rb13 = weight denom + 6 + 9
+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+
+
+################################################################################
+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, y2_x2 should be y_x+8
+# At this point we have already issued two pairs of texture requests for the current block
+
+::mc_filter
+# r0 = weight << 16; We want weight * 2 in rb14
+ asr rb14, r0, 15
+
+# r3 = 0
+
+:yloop
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values?
+
+# N.B. Whilst y == y2 as far as this loop is concerned we will start
+# the grab for the next block before we finish with this block and that
+# might be B where y != y2 so we must do full processing on both y and y2
+
+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloop
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+ # >>> .anyn yloop
+
+ # apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov -, vw_wait
+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+# The top 8 bits have rubbish in them as mul24 is unsigned
+# The low 6 bits need discard before weighting
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, rb14
+ add r1, r1, rb12
+
+ shl r1, r1, 8
+ brr.anyn -, r:yloop
+ asr r1, r1, rb13
+# We have a saturating pack unit - I can't help feeling it should be useful here
+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255
+ max vpm, r1, 0 # Delay 3
+# >>> branch.anyn yloop
+
+# DMA out
+
+ brr -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
+ mov vw_addr, unif # start the VDW Delay 3
+
+
+
+################################################################################
+
+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+# Can fill in the coefficients so only
+# Can also assume default weighted prediction for B frames.
+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+# Or possibly by taking advantage of symmetry?
+# From 19->7 32bits per command.
+
+::mc_filter_b
+ # r0 = weightL0 << 16, we want it in rb14
+ asr rb14, r0, i_shift16
+
+:yloopb
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values?
+
+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloopb
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+ # >>> .anyn yloopb
+
+ # apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb12
+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+# Top 8 bits are bad - low 6 bits should be discarded
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+
+ asr r1, r1, 14
+ nop ; mul24 r0, r1, rb14
+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
+
+ add r1, r1, r0 ; mov -, vw_wait
+ shl r1, r1, 8
+
+ brr.anyn -, r:yloopb
+ asr r1, r1, rb13 # Delay 1
+ min r1, r1, rb_k255 # Delay 2
+ max vpm, r1, 0 # Delay 3
+
+# DMA out
+ brr -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
+ mov vw_addr, unif # start the VDW Delay 3
+
+################################################################################
+
+# mc_interrupt_exit12()
+::mc_interrupt_exit12
+ mov -, vw_wait # wait on the VDW
+
+ # Dummy wait to test instructions
+# mov r3,1000000
+#:dummy_loop
+# sub.setf r3, r3, 1
+# nop
+# nop
+# brr.anynn -, r:dummy_loop
+# nop
+# nop
+# nop
+
+ ldtmu0
+ ldtmu0
+ ldtmu1
+ ldtmu1
+
+ mov -,sacq(0) # 1
+ mov -,sacq(0) # 2
+ mov -,sacq(0) # 3
+ mov -,sacq(0) # 4
+ mov -,sacq(0) # 5
+ mov -,sacq(0) # 6
+ mov -,sacq(0) # 7
+ mov -,sacq(0) # 8
+ mov -,sacq(0) # 9
+ mov -,sacq(0) # 10
+ mov -,sacq(0) # 11
+
+ nop ; nop ; thrend
+ mov interrupt, 1; nop # delay slot 1
+ nop ; nop # delay slot 2
+
+
+::mc_exit1
+ mov -, vw_wait # wait on the VDW
+
+ ldtmu0
+ ldtmu1
+ ldtmu0
+ ldtmu1
+ nop ; nop ; thrend
+ mov interrupt, 1; nop # delay slot 1
+ nop ; nop # delay slot 2
+
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
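[Editor's note: the loop epilogues above lean on two QPU quirks that are easy to misread: shift immediates use only their bottom 5 bits (hence ".set i_shift16, -16"), and mul24 results carry rubbish in their top 8 bits, which the "mul24 r1, r1, ra_k256" / "asr r1, r1, 14" pair scrubs while dropping the low 6 fraction bits. Below is a hedged plain-C rendering of both idioms, illustrative only and not part of the patch; it assumes the usual two's-complement, arithmetic-right-shift behaviour of mainstream compilers.]

#include <stdint.h>
#include <stdio.h>

/* Emulates the QPU rule that only the bottom 5 bits of a shift count
 * are used, so an immediate of -16 shifts by 16. */
static uint32_t qpu_shl(uint32_t x, int n)
{
    return x << (n & 31);
}

/* Mirrors "mul24 r1, r1, ra_k256" then "asr r1, r1, 14": bits 0-23
 * hold a signed 24-bit partial sum, bits 24-31 hold rubbish. The *256
 * shifts the rubbish off the top and puts the sign bit at bit 31; the
 * arithmetic shift then sign extends and discards the low 6 bits. */
static int32_t scrub_mul24(uint32_t r)
{
    return (int32_t)(r * 256u) >> 14;
}

int main(void)
{
    uint32_t r;
    printf("%u\n", qpu_shl(1, -16));  /* 65536, i.e. 1 << 16 */
    /* -4096 kept in 24 bits, with deliberate rubbish in the top byte: */
    r = (0xABu << 24) | (0xFFFFFFu & (uint32_t)-4096);
    printf("%d\n", scrub_mul24(r));   /* -4096 >> 6 == -64 */
    return 0;
}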
diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
new file mode 100644
index 0000000..db41a4d
--- /dev/null
+++ b/libavcodec/rpi_user_vcsm.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved.
+*
+* This program is the proprietary software of Broadcom Corporation and/or
+* its licensors, and may only be used, duplicated, modified or distributed
+* pursuant to the terms and conditions of a separate, written license
+* agreement executed between you and Broadcom (an "Authorized License").
+* Except as set forth in an Authorized License, Broadcom grants no license
+* (express or implied), right to use, or waiver of any kind with respect to
+* the Software, and Broadcom expressly reserves all rights in and to the
+* Software and all intellectual property rights therein. IF YOU HAVE NO
+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+* THE SOFTWARE.
+*
+* Except as expressly set forth in the Authorized License,
+* 1. This program, including its structure, sequence and organization,
+* constitutes the valuable trade secrets of Broadcom, and you shall use
+* all reasonable efforts to protect the confidentiality thereof, and to
+* use this information only in connection with your use of Broadcom
+* integrated circuit products.
+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+* RESPECT TO THE SOFTWARE. BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+*****************************************************************************/
+
+#ifndef __USER_VCSM__H__INCLUDED__
+#define __USER_VCSM__H__INCLUDED__
+
+/* VideoCore Shared Memory - user interface library.
+**
+** This library provides all the necessary abstraction for any application to
+** make use of the shared memory service which is distributed across a kernel
+** driver and a videocore service.
+**
+** It is an application design decision to choose whether or not to use this service.
+**
+** The logical flow of operations that a user application needs to follow when
+** using this service is:
+**
+** 1) Initialize the service.
+** 2) Allocate shared memory blocks.
+** 3) Start using the allocated blocks.
+** - In order to gain ownership on a block, lock the allocated block;
+** locking a block returns a valid address that the user application
+** can access.
+** - When finished with using the block for the current execution cycle
+** or function, and so when giving up the ownership, unlock the block.
+** 4) A block can be locked/unlocked as many times as required - within or outside
+** of - a specific execution context.
+** 5) To completely release an allocated block, free it.
+** 6) If the service is no longer required, terminate it.
+**
+**
+** Some generic considerations:
+
+** Allocating memory blocks.
+**
+** Memory blocks can be allocated in different manners depending on the cache
+** behavior desired. A given block can either be:
+
+** - Allocated in a non cached fashion all the way through host and videocore.
+** - Allocated in a cached fashion on host OR videocore.
+** - Allocated in a cached fashion on host AND videocore.
+**
+** It is an application decision to determine how to allocate a block. Evidently
+** if the application will be doing substantial read/write accesses to a given block,
+** it is recommended to allocate the block at least in a 'host cached' fashion for
+** better results.
+**
+**
+** Locking memory blocks.
+**
+** When the memory block has been allocated in a host cached fashion, locking the
+** memory block (and so taking ownership of it) will trigger a cache invalidation.
+**
+** For the above reason and when using host cached allocation, it is important that
+** an application properly implements the lock/unlock mechanism to ensure the cache
+** stays coherent; otherwise there is no guarantee that it will.
+**
+** It is possible to dynamically change the host cache behavior (ie cached or non
+** cached) of a given allocation without needing to free and re-allocate the block.
+** This feature can be useful for an application which requires access to the block
+** only at certain times and not otherwise. By changing the cache behavior dynamically
+** the application can optimize performance for a given duration of use.
+** Such dynamic cache behavior remapping only applies to host cache and not videocore
+** cache. If one requires to change the videocore cache behavior, then a new block
+** must be created to replace the old one.
+**
+** On successful locking, a valid pointer is returned that the application can use
+** to access the data inside the block. There is no guarantee that the pointer will
+** stay valid following the unlock action corresponding to this lock.
+**
+**
+** Unlocking memory blocks.
+**
+** When the memory block has been allocated in a host cached fashion, unlocking the
+** memory block (and so relinquishing its ownership) will trigger a cache flush unless
+** explicitly asked not to flush the cache for performance reasons.
+**
+** For the above reason and when using host cached allocation, it is important that
+** an application properly implements the lock/unlock mechanism to ensure the cache
+** stays coherent; otherwise there is no guarantee that it will.
+**
+**
+** A complete API is defined below.
+*/
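[Editor's note: the lock/unlock discipline described above maps onto a small amount of code. The sketch below is hypothetical usage, not part of the patch; it assumes the full header also declares vcsm_lock() returning a usable pointer and a matching vcsm_unlock_ptr(), which the comments here refer to but which fall outside this excerpt.]

#include <string.h>
#include "rpi_user_vcsm.h"

/* Assumed declarations (see note above); the excerpt only shows
 * init/alloc/free/exit. */
void *vcsm_lock( unsigned int handle );
int vcsm_unlock_ptr( void *usr_ptr );

static int vcsm_demo(void)
{
    unsigned int h;
    void *p;

    if (vcsm_init() != 0)                      /* 1) initialize */
        return -1;

    /* 2) allocate, host-cached since the CPU writes the block below */
    h = vcsm_malloc_cache(4096, VCSM_CACHE_TYPE_HOST, "demo");
    if (h == 0) {
        vcsm_exit();
        return -1;
    }

    p = vcsm_lock(h);                          /* 3) take ownership */
    if (p != NULL) {
        memset(p, 0, 4096);                    /*    use the block */
        vcsm_unlock_ptr(p);                    /* 4) give it back */
    }

    vcsm_free(h);                              /* 5) release */
    vcsm_exit();                               /* 6) terminate */
    return 0;
}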
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Different status that can be dumped.
+*/
+typedef enum
+{
+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore.
+ // Result of the walk is seen in the videocore
+ // log.
+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host
+ // driver (ie for all processes). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host
+ // driver (for current process). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+ // driver (for current process). Result of
+ // the walk is seen in the kernel log.
+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+ // VCSM_STATUS_HOST_WALK_MAP.
+ //
+ VCSM_STATUS_NONE, // Must be last - invalid.
+
+} VCSM_STATUS_T;
+
+/* Different kind of cache behavior.
+*/
+typedef enum
+{
+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies.
+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space).
+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore.
+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore.
+
+} VCSM_CACHE_TYPE_T;
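[Editor's note: one way to act on the allocation guidance earlier in this header; a trivial, hypothetical helper (not from the header) mapping access pattern to cache type.]

/* Hypothetical helper: host-cached when the ARM side will touch the
 * block heavily, uncached when VideoCore does nearly all the work. */
static VCSM_CACHE_TYPE_T vcsm_pick_cache(int host_heavy)
{
    return host_heavy ? VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE;
}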
+
+/* Initialize the vcsm processing.
+**
+** Must be called once before attempting to do anything else.
+**
+** Returns 0 on success, -1 on error.
+*/
+int vcsm_init( void );
+
+
+/* Terminates the vcsm processing.
+**
+** Must be called when vcsm services are no longer needed; it will
+** take care of removing any allocation under the current process
+** control if deemed necessary.
+*/
+void vcsm_exit( void );
+
+
+/* Queries the status of the vcsm.
+**
+** Triggers a dump of various kinds of information; see the
+** different variants specified in VCSM_STATUS_T.
+**
+** Pid is optional.
+*/
+void vcsm_status( VCSM_STATUS_T status, int pid );
+
+
+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+** allocator.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on the one that can be used).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc( unsigned int size, char *name );
+
+
+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+** allocator; the type of caching requested is passed as argument of the
+** function call.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on the one that can be used).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+
+
+/* Shares an allocated block of memory via the vcsm memory allocator.
+**
+** Returns: 0 on error
+** a non-zero opaque handle on success.
+**
+** On success, the user must invoke vcsm_lock with the returned opaque
+** handle to gain access to the memory associated with the opaque handle.
+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+** function definitions for more details on the one that can be used).
+**
+** A well behaved application should make every attempt to lock/unlock
+** only for the duration it needs to access the memory data associated with
+** the opaque handle.
+*/
+unsigned int vcsm_malloc_share( unsigned int handle );
+
+
+/* Resizes a block of memory allocated previously by vcsm_alloc.
+**
+** Returns: 0 on success
+** -errno on error.
+**
+** The handle must be unlocked by user prior to attempting any
+** resize action.
+**
+** On error, the original size allocated against the handle
+** remains available the same way it would be following a
+** successful vcsm_malloc.
+*/
+int vcsm_resize( unsigned int handle, unsigned int new_size );
+
+
+/* Frees a block of memory that was successfully allocated by
|
|
+** a prior call the vcms_alloc.
|
|
+**
|
|
+** The handle should be considered invalid upon return from this
|
|
+** call.
|
|
+**
|
|
+** Whether any memory is actually freed up or not as the result of
|
|
+** this call will depends on many factors, if all goes well it will
|
|
+** be freed. If something goes wrong, the memory will likely end up
|
|
+** being freed up as part of the vcsm_exit process. In the end the
|
|
+** memory is guaranteed to be freed one way or another.
|
|
+*/
|
|
+void vcsm_free( unsigned int handle );
|
|
+
|
|
+
|
|
+/* Retrieves a videocore opaque handle from a mapped user address
|
|
+** pointer. The videocore handle will correspond to the actual
|
|
+** memory mapped in videocore.
|
|
+**
|
|
+** Returns: 0 on error
|
|
+** a non-zero opaque handle on success.
|
|
+**
|
|
+** Note: the videocore opaque handle is distinct from the user
|
|
+** opaque handle (allocated via vcsm_malloc) and it is only
|
|
+** significant for such application which knows what to do
|
|
+** with it, for the others it is just a number with little
|
|
+** use since nothing can be done with it (in particular
|
|
+** for safety reason it cannot be used to map anything).
|
|
+*/
|
|
+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
|
|
+
|
|
+
|
|
+/* Retrieves a videocore opaque handle from a opaque handle
|
|
+** pointer. The videocore handle will correspond to the actual
|
|
+** memory mapped in videocore.
|
|
+**
|
|
+** Returns: 0 on error
|
|
+** a non-zero opaque handle on success.
|
|
+**
|
|
+** Note: the videocore opaque handle is distinct from the user
|
|
+** opaque handle (allocated via vcsm_malloc) and it is only
|
|
+** significant for such application which knows what to do
|
|
+** with it, for the others it is just a number with little
|
|
+** use since nothing can be done with it (in particular
|
|
+** for safety reason it cannot be used to map anything).
|
|
+*/
|
|
+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
|
|
+
|
|
+
|
|
+/* Retrieves a user opaque handle from a mapped user address
|
|
+** pointer.
|
|
+**
|
|
+** Returns: 0 on error
|
|
+** a non-zero opaque handle on success.
|
|
+*/
|
|
+unsigned int vcsm_usr_handle( void *usr_ptr );
|
|
+
|
|
+
|
|
+/* Retrieves a mapped user address from an opaque user
|
|
+** handle.
|
|
+**
|
|
+** Returns: 0 on error
|
|
+** a non-zero address on success.
|
|
+**
|
|
+** On success, the address corresponds to the pointer
|
|
+** which can access the data allocated via the vcsm_malloc
|
|
+** call.
|
|
+*/
|
|
+void *vcsm_usr_address( unsigned int handle );
|
|
+
|
|
+
|
|
+/* Locks the memory associated with this opaque handle.
|
|
+**
|
|
+** Returns: NULL on error
|
|
+** a valid pointer on success.
|
|
+**
|
|
+** A user MUST lock the handle received from vcsm_malloc
|
|
+** in order to be able to use the memory associated with it.
|
|
+**
|
|
+** On success, the pointer returned is only valid within
|
|
+** the lock content (ie until a corresponding vcsm_unlock_xx
|
|
+** is invoked).
|
|
+*/
|
|
+void *vcsm_lock( unsigned int handle );
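+
+// A minimal usage sketch of the allocate/lock/use/unlock/free cycle
+// described above (error handling elided; the size and name are
+// illustrative placeholders, not values required by the API):
+//
+//   vcsm_init();
+//   unsigned int hdl = vcsm_malloc(64 * 1024, "example");
+//   void *p = vcsm_lock(hdl);   // pointer valid only until unlock
+//   memset(p, 0, 64 * 1024);    // access the memory while locked
+//   vcsm_unlock_hdl(hdl);
+//   vcsm_free(hdl);
+//   vcsm_exit();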
+
+
+/* Locks the memory associated with this opaque handle. The lock
+** also gives a chance to update the *host* cache behavior of the
+** allocated buffer if so desired. The *videocore* cache behavior
+** of the allocated buffer cannot be changed by this call and such
+** an attempt will be ignored.
+**
+** The system will attempt to honour the cache_update mode request;
+** the cache_result mode will provide the final answer on which cache
+** mode is really in use. Failing to change the cache mode will not
+** result in a failure to lock the buffer, as it is an application
+** decision to choose what to do if (cache_result != cache_update).
+**
+** The value returned in cache_result can only be considered valid if
+** the returned pointer is non-NULL. The cache_result pointer may be
+** NULL if the application does not care about the actual outcome of
+** its action with regards to the cache behavior change.
+**
+** Returns: NULL on error
+**          a valid pointer on success.
+**
+** A user MUST lock the handle received from vcsm_malloc
+** in order to be able to use the memory associated with it.
+**
+** On success, the pointer returned is only valid within
+** the lock context (ie until a corresponding vcsm_unlock_xx
+** is invoked).
+*/
+void *vcsm_lock_cache( unsigned int handle,
+                       VCSM_CACHE_TYPE_T cache_update,
+                       VCSM_CACHE_TYPE_T *cache_result );
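+
+// Sketch: request host caching on lock and check what was actually
+// granted (the fallback policy shown is an illustrative assumption):
+//
+//   VCSM_CACHE_TYPE_T got;
+//   void *p = vcsm_lock_cache(hdl, VCSM_CACHE_TYPE_HOST, &got);
+//   if (p != NULL && got != VCSM_CACHE_TYPE_HOST) {
+//       // application's choice: carry on, or unlock and fail
+//   }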
+
+
+/* Unlocks the memory associated with this user mapped address.
+**
+** Returns: 0 on success
+**          -errno on error.
+**
+** After unlocking a mapped address, the user should no longer
+** attempt to reference it.
+*/
+int vcsm_unlock_ptr( void *usr_ptr );
+
+
+/* Unlocks the memory associated with this user mapped address.
+** Applies special processing that overrides the otherwise
+** default behavior.
+**
+** If 'cache_no_flush' is specified:
+**   Do not flush cache as the result of the unlock (if cache
+**   flush was otherwise applicable in this case).
+**
+** Returns: 0 on success
+**          -errno on error.
+**
+** After unlocking a mapped address, the user should no longer
+** attempt to reference it.
+*/
+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+
+
+/* Unlocks the memory associated with this user opaque handle.
+**
+** Returns: 0 on success
+**          -errno on error.
+**
+** After unlocking an opaque handle, the user should no longer
+** attempt to reference the mapped address once associated
+** with it.
+*/
+int vcsm_unlock_hdl( unsigned int handle );
+
+
+/* Unlocks the memory associated with this user opaque handle.
+** Applies special processing that overrides the otherwise
+** default behavior.
+**
+** If 'cache_no_flush' is specified:
+**   Do not flush cache as the result of the unlock (if cache
+**   flush was otherwise applicable in this case).
+**
+** Returns: 0 on success
+**          -errno on error.
+**
+** After unlocking an opaque handle, the user should no longer
+** attempt to reference the mapped address once associated
+** with it.
+*/
+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+
+/* Clean and/or invalidate the memory associated with this user opaque handle.
+**
+** Returns: non-zero on error
+**
+** The structure contains a list of flush/invalidate commands. Commands are:
+**   0: nop
+**   1: invalidate the given virtual range in L1/L2
+**   2: clean the given virtual range in L1/L2
+**   3: clean+invalidate the given virtual range in L1/L2
+**   4: flush all L1/L2
+*/
+struct vcsm_user_clean_invalid_s {
+    struct {
+        unsigned int cmd;
+        unsigned int handle;
+        unsigned int addr;
+        unsigned int size;
+    } s[8];
+};
+
+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
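+
+// Sketch: clean+invalidate (cmd 3) a single locked range; the remaining
+// slots stay cmd 0 (nop). hdl/p and the size are placeholders for a
+// handle, its locked address and the range length:
+//
+//   struct vcsm_user_clean_invalid_s ci;
+//   memset(&ci, 0, sizeof ci);       // all 8 slots default to nop
+//   ci.s[0].cmd    = 3;              // clean+invalidate in L1/L2
+//   ci.s[0].handle = hdl;
+//   ci.s[0].addr   = (unsigned int)p;
+//   ci.s[0].size   = 64 * 1024;
+//   if (vcsm_clean_invalid(&ci) != 0) { /* error */ }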
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __USER_VCSM__H__INCLUDED__ */
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
index 0000000..9580165
--- /dev/null
+++ b/libavcodec/rpi_zc.c
@@ -0,0 +1,406 @@
+#include "config.h"
+#ifdef RPI
+#include "rpi_qpu.h"
+#include "rpi_zc.h"
+
+#include "libavutil/buffer_internal.h"
+
+struct ZcPoolEnt;
+
+typedef struct ZcPool
+{
+    int numbytes;
+    struct ZcPoolEnt * head;
+    pthread_mutex_t lock;
+} ZcPool;
+
+typedef struct ZcPoolEnt
+{
+    // It is important that we start with gmem as other bits of code will expect to see that
+    GPU_MEM_PTR_T gmem;
+    struct ZcPoolEnt * next;
+    struct ZcPool * pool;
+} ZcPoolEnt;
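+
+// Note on the free-list strategy below: entries of only one size are
+// cached at a time - zc_pool_alloc() flushes the list whenever a
+// different size is requested, and zc_pool_free() recycles an entry
+// back onto the list only while its size still matches pool->numbytes.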
+
+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
+{
+    ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+
+    if (zp == NULL) {
+        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+        goto fail0;
+    }
+
+    if (gpu_malloc_cached(size, &zp->gmem) != 0)
+    {
+        av_log(NULL, AV_LOG_ERROR, "gpu_malloc_cached(%d) failed\n", size);
+        goto fail1;
+    }
+
+    zp->next = NULL;
+    zp->pool = pool;
+    return zp;
+
+fail1:
+    av_free(zp);
+fail0:
+    return NULL;
+}
+
+static void zc_pool_ent_free(ZcPoolEnt * const zp)
+{
+    gpu_free(&zp->gmem);
+    av_free(zp);
+}
+
+static void zc_pool_flush(ZcPool * const pool)
+{
+    ZcPoolEnt * p = pool->head;
+    pool->head = NULL;
+    while (p != NULL)
+    {
+        ZcPoolEnt * const zp = p;
+        p = p->next;
+        zc_pool_ent_free(zp);
+    }
+}
+
+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes)
+{
+    ZcPoolEnt * zp;
+    pthread_mutex_lock(&pool->lock);
+
+    if (numbytes != pool->numbytes)
+    {
+        zc_pool_flush(pool);
+        pool->numbytes = numbytes;
+    }
+
+    if (pool->head != NULL)
+    {
+        zp = pool->head;
+        pool->head = zp->next;
+    }
+    else
+    {
+        zp = zc_pool_ent_alloc(pool, numbytes);
+    }
+
+    pthread_mutex_unlock(&pool->lock);
+    return zp;
+}
+
+static void zc_pool_free(ZcPoolEnt * const zp)
+{
+    ZcPool * const pool = zp == NULL ? NULL : zp->pool;
+    if (zp != NULL)
+    {
+        pthread_mutex_lock(&pool->lock);
+        if (pool->numbytes == zp->gmem.numbytes)
+        {
+            zp->next = pool->head;
+            pool->head = zp;
+            pthread_mutex_unlock(&pool->lock);
+        }
+        else
+        {
+            pthread_mutex_unlock(&pool->lock);
+            zc_pool_ent_free(zp);
+        }
+    }
+}
+
+static void
+zc_pool_init(ZcPool * const pool)
+{
+    pool->numbytes = -1;
+    pool->head = NULL;
+    pthread_mutex_init(&pool->lock, NULL);
+}
+
+static void
+zc_pool_destroy(ZcPool * const pool)
+{
+    pool->numbytes = -1;
+    zc_pool_flush(pool);
+    pthread_mutex_destroy(&pool->lock);
+}
+
+
+typedef struct AVZcEnv
+{
+    ZcPool pool;
+} ZcEnv;
+
+// Callback when buffer unrefed to zero
+static void rpi_free_display_buffer(void *opaque, uint8_t *data)
+{
+    ZcPoolEnt *const zp = opaque;
+//  printf("%s: data=%p\n", __func__, data);
+    zc_pool_free(zp);
+}
+
+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
+{
+    // Kludge where we check the free fn to check this is really
+    // one of our buffers - can't think of a better way
+    return buf == NULL || buf->buffer->free != rpi_free_display_buffer ? NULL :
+        av_buffer_get_opaque(buf);
+}
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+    const unsigned int video_width, const unsigned int video_height)
+{
+    AVRpiZcFrameGeometry geo;
+    geo.stride_y = (video_width + 32 + 31) & ~31;
+    geo.stride_c = geo.stride_y / 2;
+//  geo.height_y = (video_height + 15) & ~15;
+    geo.height_y = (video_height + 32 + 31) & ~31;
+    geo.height_c = geo.height_y / 2;
+    return geo;
+}
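+
+// Worked example of the geometry above: 1920x1080 maps to
+// stride_y = (1920 + 63) & ~31 = 1952, height_y = (1080 + 63) & ~31 = 1120,
+// stride_c = 976, height_c = 560, so one frame needs
+// 1952*1120 + 2*976*560 bytes of contiguous GPU memory.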
+
+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+{
+    ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+    AVBufferRef * buf;
+
+    if (zp == NULL) {
+        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+        goto fail0;
+    }
+
+    if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+    {
+        av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+        goto fail2;
+    }
+
+    return buf;
+
+fail2:
+    zc_pool_free(zp);
+fail0:
+    return NULL;
+}
+
+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame)
+{
+    ZcEnv *const zc = s->get_buffer_context;
+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height);
+    const unsigned int size_y = geo.stride_y * geo.height_y;
+    const unsigned int size_c = geo.stride_c * geo.height_c;
+    const unsigned int size_pic = size_y + size_c * 2;
+    AVBufferRef * buf;
+    unsigned int i;
+
+//  printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
+
+    if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+    {
+        av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+        frame->buf[i] = NULL;
+        frame->data[i] = NULL;
+        frame->linesize[i] = 0;
+    }
+
+    frame->buf[0] = buf;
+    frame->linesize[0] = geo.stride_y;
+    frame->linesize[1] = geo.stride_c;
+    frame->linesize[2] = geo.stride_c;
+    frame->data[0] = buf->data;
+    frame->data[1] = frame->data[0] + size_y;
+    frame->data[2] = frame->data[1] + size_c;
+    frame->extended_data = frame->data;
+    // Leave extended buf alone
+
+    return 0;
+}
+
+
+#define RPI_GET_BUFFER2 1
+
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+{
+#if !RPI_GET_BUFFER2
+    return avcodec_default_get_buffer2(s, frame, flags);
+#else
+    int rv;
+
+    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 ||
+        frame->format != AV_PIX_FMT_YUV420P)
+    {
+//      printf("Do default alloc: format=%#x\n", frame->format);
+        rv = avcodec_default_get_buffer2(s, frame, flags);
+    }
+    else
+    {
+        rv = rpi_get_display_buffer(s, frame);
+    }
+
+#if 0
+    printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+        frame->width, frame->height,
+        frame->linesize[0], frame->linesize[1], frame->linesize[2],
+        frame->data[0], frame->data[1], frame->data[2],
+        frame->buf[0], frame->buf[1], frame->buf[2],
+        av_buffer_get_opaque(frame->buf[0]));
+#endif
+    return rv;
+#endif
+}
+
+
+static AVBufferRef * zc_copy(struct AVCodecContext * const s,
+    const AVFrame * const src)
+{
+    AVFrame dest_frame;
+    AVFrame * const dest = &dest_frame;
+    unsigned int i;
+    uint8_t * psrc, * pdest;
+
+    dest->width = src->width;
+    dest->height = src->height;
+
+    if (rpi_get_display_buffer(s, dest) != 0)
+    {
+        return NULL;
+    }
+
+    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+         i != dest->height;
+         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+    {
+        memcpy(pdest, psrc, dest->width);
+    }
+    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
+         i != dest->height / 2;
+         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
+    {
+        memcpy(pdest, psrc, dest->width / 2);
+    }
+    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
+         i != dest->height / 2;
+         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
+    {
+        memcpy(pdest, psrc, dest->width / 2);
+    }
+
+    return dest->buf[0];
+}
+
+
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+    const AVFrame * const frame, const int maycopy)
+{
+    assert(s != NULL);
+
+    if (frame->format != AV_PIX_FMT_YUV420P)
+    {
+        av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format);
+        return NULL;
+    }
+
+    if (frame->buf[1] != NULL)
+    {
+        if (maycopy)
+        {
+            av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+            return zc_copy(s, frame);
+        }
+        else
+        {
+            av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__);
+            return NULL;
+        }
+    }
+
+    if (pic_gm_ptr(frame->buf[0]) == NULL)
+    {
+        if (maycopy)
+        {
+            av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
+            return zc_copy(s, frame);
+        }
+        else
+        {
+            av_log(s, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
+            return NULL;
+        }
+    }
+
+    return av_buffer_ref(frame->buf[0]);
+}
+
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
+{
+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+    return p == NULL ? -1 : p->vc_handle;
+}
+
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+    return p == NULL ? 0 : p->numbytes;
+}
+
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
+{
+    if (fr_ref != NULL)
+    {
+        av_buffer_unref(&fr_ref);
+    }
+}
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void)
+{
+    ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
+    if (zc == NULL)
+    {
+        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
+        return NULL;
+    }
+
+    zc_pool_init(&zc->pool);
+    return zc;
+}
+
+void av_rpi_zc_env_free(AVZcEnvPtr zc)
+{
+    if (zc != NULL)
+    {
+        zc_pool_destroy(&zc->pool);
+        av_free(zc);
+    }
+}
+
+int av_rpi_zc_init(struct AVCodecContext * const s)
+{
+    ZcEnv * const zc = av_rpi_zc_env_alloc();
+    if (zc == NULL)
+    {
+        return AVERROR(ENOMEM);
+    }
+
+    s->get_buffer_context = zc;
+    s->get_buffer2 = av_rpi_zc_get_buffer2;
+    return 0;
+}
+
+void av_rpi_zc_uninit(struct AVCodecContext * const s)
+{
+    if (s->get_buffer2 == av_rpi_zc_get_buffer2)
+    {
+        ZcEnv * const zc = s->get_buffer_context;
+        s->get_buffer2 = avcodec_default_get_buffer2;
+        s->get_buffer_context = NULL;
+        av_rpi_zc_env_free(zc);
+    }
+}
+
+#endif // RPI
+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000..f0109f4
--- /dev/null
+++ b/libavcodec/rpi_zc.h
@@ -0,0 +1,83 @@
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
+// Zero-Copy frame code for RPi
+// RPi needs Y/U/V planes to be contiguous for display. By default
+// ffmpeg will allocate separated planes so a memcpy is needed before
+// display. This code provides a method of making ffmpeg allocate a single
+// block of memory for the frame, which can then be reference counted until
+// display has finished with it.
+
+#include "libavutil/frame.h"
+#include "libavcodec/avcodec.h"
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+    unsigned int stride_y;
+    unsigned int height_y;
+    unsigned int stride_c;
+    unsigned int height_c;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+    const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_open2
+//
+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1, as otherwise the buffer info is killed before being
+// returned by avcodec_decode_video2. Note also that this means that the
+// AVFrame that is returned must be manually unrefed with av_frame_unref.
+// This should be done after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by _get_buffer_2
+// then the behaviour depends on maycopy:
+//   If maycopy=0 then return NULL
+//   If maycopy=1 && the src frame is in a form where we can easily copy
+//     the data, then allocate a new buffer and copy the data into it
+//   Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+    const AVFrame * const frame, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this is a NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Free the environment allocated by av_rpi_zc_env_alloc
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Free ZC from a context
+// There is nothing magic in this fn - it just packages unsetting
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
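+
+// Minimal decode-loop sketch using the API above (error handling
+// elided; avctx/frame/pkt/got are assumed to exist already):
+//
+//   av_rpi_zc_init(avctx);            // hooks get_buffer2 + context
+//   avctx->refcounted_frames = 1;     // required - see note above
+//   ...
+//   avcodec_decode_video2(avctx, frame, &got, &pkt);
+//   AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, 1);
+//   av_frame_unref(frame);            // ref keeps the buffer alive
+//   ... pass av_rpi_zc_vc_handle(ref) to the display path ...
+//   av_rpi_zc_unref(ref);             // once display has finished
+//   ...
+//   av_rpi_zc_uninit(avctx);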
+
+#endif
+
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index f7adb52..708526e 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
  */
 
 #include "config.h"
+
+#ifdef RPI
+// Move video buffers to GPU memory
+#define RPI_GPU_BUFFERS
+#endif
+
 #include "libavutil/atomic.h"
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
@@ -64,6 +70,10 @@
 #include "libavutil/ffversion.h"
 const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
 
+#ifdef RPI_GPU_BUFFERS
+#include "rpi_qpu.h"
+#endif
+
 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
 static int default_lockmgr_cb(void **arg, enum AVLockOp op)
 {
@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
     return ret;
 }
 
+#ifdef RPI_GPU_BUFFERS
+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+{
+    GPU_MEM_PTR_T *p = opaque;
+    gpu_free(p);
+    av_free(p);
+}
+
+static AVBufferRef *rpi_buffer_alloc(int size)
+{
+    AVBufferRef *ret = NULL;
+    uint8_t *data = NULL;
+    GPU_MEM_PTR_T *p;
+
+    static int total = 0;
+    total += size;
+
+    p = av_malloc(sizeof *p);
+    if (!p)
+        return NULL;
+
+    if (gpu_malloc_cached(size, p) < 0) { // Change this line to choose cached or uncached memory. The caching here refers to the ARM data cache.
+        av_free(p);
+        return NULL;
+    }
+
+    data = p->arm;
+    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n", size, total, p->arm, p->vc, p->vc + size);
+    //memset(data, 64, size);
+
+    if (!data)
+        return NULL;
+
+    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+    if (!ret) {
+        gpu_free(p);
+        av_freep(&p);
+    }
+
+    return ret;
+}
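+
+// Note: rpi_buffer_alloc() matches the AVBufferPool allocator callback
+// signature (AVBufferRef *(*)(int size)), which is why it can be handed
+// directly to av_buffer_pool_init() in place of the default allocator,
+// as done for HEVC in update_frame_pool() below.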
+#endif
+
 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
 {
     FramePool *pool = avctx->internal->pool;
@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
             av_buffer_pool_uninit(&pool->pools[i]);
             pool->linesize[i] = linesize[i];
             if (size[i]) {
+#ifdef RPI_GPU_BUFFERS
+                if (avctx->codec_id == AV_CODEC_ID_HEVC)
+                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                         CONFIG_MEMORY_POISONING ?
+                                                             NULL :
+                                                             rpi_buffer_alloc);
+                else
+#endif
                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
                                                      CONFIG_MEMORY_POISONING ?
                                                          NULL :
                                                          av_buffer_pool_default_alloc);
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index b31d233..2767306 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
 #endif
     { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
     { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
-    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
     { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
     { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
    { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 6f343f2..83f26d5 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
     int default_stream_index = av_find_default_stream_index(s);
     if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
         for (i = 0; i < s->nb_streams; i++) {
-            if (av_find_program_from_stream(s, NULL, i))
+            if (0 && av_find_program_from_stream(s, NULL, i))
                 continue;
             s->streams[i]->pts_wrap_reference = pts_wrap_reference;
             s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
diff --git a/libavutil/buffer.c b/libavutil/buffer.c
index 694e116..203ca7b 100644
--- a/libavutil/buffer.c
+++ b/libavutil/buffer.c
@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
 
     return ret;
 }
+
+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+    BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+    return buf->opaque;
+}
diff --git a/libavutil/buffer.h b/libavutil/buffer.h
index 0c0ce12..82e0bc3 100644
--- a/libavutil/buffer.h
+++ b/libavutil/buffer.h
@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
  */
 AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
 
+// Return the opaque for the underlying frame
+void *av_buffer_pool_opaque(AVBufferRef *ref);
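+// (e.g. for pool buffers created by the RPi GPU allocator in
+// libavcodec/utils.c the value returned is that buffer's GPU_MEM_PTR_T)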
+
 /**
  * @}
  */
diff --git a/pi-util/conf.sh b/pi-util/conf.sh
new file mode 100755
index 0000000..8b596a2
--- /dev/null
+++ b/pi-util/conf.sh
@@ -0,0 +1,33 @@
+echo "Configure for Pi2/3"
+
+RPI_BUILDROOT=`pwd`/build
+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=$RPI_ROOTFS/opt/vc
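+# The four paths above are assumptions about the build layout: a Pi
+# sysroot and the arm-bcm2708 cross-toolchain must already be unpacked
+# under ./build; adjust RPI_ROOTFS/RPI_TOOLROOT to match your setup.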
+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+./configure --enable-cross-compile\
+ --arch=armv6t2\
+ --cpu=cortex-a7\
+ --target-os=linux\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000..61d1399
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100644
index 0000000..38f942f
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+conf_root = "/opt/conform/h265"
+ffmpeg_exec = "./ffmpeg"
+
+def testone(fileroot, name, es_file, md5_file):
+    tmp_root = "/tmp"
+
+    dec_file = os.path.join(tmp_root, name + ".dec.md5")
+    try:
+        os.remove(dec_file)
+    except:
+        pass
+
+    flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+    # Unaligned needed for cropping conformance
+    rstr = subprocess.call(
+        [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+        stdout=flog, stderr=subprocess.STDOUT)
+
+    try:
+        m1 = None
+        m2 = None
+        with open(os.path.join(fileroot, md5_file)) as f:
+            for line in f:
+                m1 = re.search("[0-9a-f]{32}", line.lower())
+                if m1:
+                    break
+
+        with open(dec_file) as f:
+            m2 = re.search("[0-9a-f]{32}", f.readline())
+    except:
+        pass
+
+    rv = False
+    if m1 and m2 and m1.group() == m2.group():
+        print >> flog, "Match: " + m1.group()
+        rv = True
+    elif not m1:
+        print >> flog, "****** Cannot find m1"
+    elif not m2:
+        print >> flog, "****** Cannot find m2"
+    else:
+        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+    flog.close()
+    return rv
+
+def scandir(root):
+    aconf = []
+    ents = os.listdir(root)
+    ents.sort(key=str.lower)
+    for name in ents:
+        test_path = os.path.join(root, name)
+        if S_ISDIR(os.stat(test_path).st_mode):
+            files = os.listdir(test_path)
+            es_file = "?"
+            md5_file = "?"
+            for f in files:
+                (base, ext) = os.path.splitext(f)
+                if base[0] == '.':
+                    pass
+                elif ext == ".bit" or ext == ".bin":
+                    es_file = f
+                elif ext == ".md5":
+                    if md5_file == "?":
+                        md5_file = f
+                    elif base[-3:] == "yuv":
+                        md5_file = f
+            aconf.append((1, name, es_file, md5_file))
+    return aconf
+
+def runtest(name, tests):
+    if not tests:
+        return True
+    for t in tests:
+        if name[0:len(t)] == t:
+            return True
+    return False
+
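+# Each CSV row is (expected, name, es_file, md5_file). expected selects
+# how a result is reported: 0 skips the test, 1 means a pass is expected
+# (a fail is flagged), 2 marks a known failure (a pass is flagged as an
+# unexpected success).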
+def doconf(csva, tests):
+    failures = []
+    unx_success = []
+    for a in csva:
+        exp_test = int(a[0])
+        if (exp_test and runtest(a[1], tests)):
+            name = a[1]
+            print "==== ", name,
+            sys.stdout.flush()
+
+            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
+                if exp_test == 1:
+                    failures.append(name)
+                    print ": * FAIL *"
+                else:
+                    print ": fail"
+            else:
+                if exp_test == 2:
+                    print ": * OK *"
+                    unx_success.append(name)
+                else:
+                    print ": ok"
+
+    if failures or unx_success:
+        print "Unexpected Failures:", failures
+        print "Unexpected Success: ", unx_success
+    else:
+        print "All tests normal"
+
+
+class ConfCSVDialect(csv.Dialect):
+    delimiter = ','
+    doublequote = True
+    lineterminator = '\n'
+    quotechar = '"'
+    quoting = csv.QUOTE_MINIMAL
+    skipinitialspace = True
+    strict = True
+
+if __name__ == '__main__':
+
+    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+    argp.add_argument("tests", nargs='*')
+    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+    argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename")
+    args = argp.parse_args()
+
+    if args.csvgen:
+        csv.writer(sys.stdout).writerows(scandir(conf_root))
+        exit(0)
+
+    with open(args.csv, 'rt') as csvfile:
+        csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+    doconf(csva, args.tests)
+
diff --git a/pi-util/qasm.py b/pi-util/qasm.py
|
|
new file mode 100644
|
|
index 0000000..1eacc04
|
|
--- /dev/null
|
|
+++ b/pi-util/qasm.py
|
|
@@ -0,0 +1,2502 @@
|
|
+#!/usr/bin/env python
|
|
+
|
|
+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment
|
|
+# add r0, r0, 1 # implicit mul nop
|
|
+# nop # explicit add nop, implicit mul nop
|
|
+# bkpt # implicit add/mul nop
|
|
+# mov r0, 0x1234 # hex immediate
|
|
+# mov r0, 20 * 40 # expressions...
|
|
+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits
|
|
+# mov r0, a:label # put address of label in r0
|
|
+# :label
|
|
+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address
|
|
+# :1
|
|
+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address
|
|
+# :1 # multiple definitions of numeric labels (differentiated using f/b)
|
|
+# .set my_val, 3 # introduce alias for 3
|
|
+# .set my_reg, r0 # and for r0
|
|
+# mov my_reg, my_val # then use them
|
|
+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3
|
|
+# .macro my_add, a, b, c # a, b, c act as if .set on entry
|
|
+# .set my_val, 10
|
|
+# add a, b, c
|
|
+# mov r0, my_val # 10
|
|
+# .endm # forget all .sets since .macro (including arg .sets)
|
|
+# mov r0, my_val # 3
|
|
+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right)
|
|
+
|
|
+import math
|
|
+import optparse
|
|
+import os
|
|
+import random
|
|
+import re
|
|
+import struct
|
|
+import sys
|
|
+import time
|
|
+
|
|
+###############################################################################
|
|
+# constants
|
|
+###############################################################################
|
|
+
|
|
+# ops
|
|
+######
|
|
+
|
|
+# negatives are internal qasm ops
|
|
+
|
|
+AOP_MOV = -3 # two operands
|
|
+AOP_BRA = -2 # two operands
|
|
+AOP_BRR = -1 # two operands
|
|
+AOP_NOP = 0x00 # no operands
|
|
+AOP_FADD = 0x01
|
|
+AOP_FSUB = 0x02
|
|
+AOP_FMIN = 0x03
|
|
+AOP_FMAX = 0x04
|
|
+AOP_FMINABS = 0x05
|
|
+AOP_FMAXABS = 0x06
|
|
+AOP_FTOI = 0x07 # two operands
|
|
+AOP_ITOF = 0x08 # two operands
|
|
+AOP_ADD = 0x0c
|
|
+AOP_SUB = 0x0d
|
|
+AOP_SHR = 0x0e
|
|
+AOP_ASR = 0x0f
|
|
+AOP_ROR = 0x10
|
|
+AOP_SHL = 0x11
|
|
+AOP_MIN = 0x12
|
|
+AOP_MAX = 0x13
|
|
+AOP_AND = 0x14
|
|
+AOP_OR = 0x15
|
|
+AOP_XOR = 0x16
|
|
+AOP_NOT = 0x17 # two operands
|
|
+AOP_CLZ = 0x18 # two operands
|
|
+AOP_V8ADDS = 0x1e
|
|
+AOP_V8SUBS = 0x1f
|
|
+
|
|
+MOP_MOV = -1 # two operands
|
|
+MOP_NOP = 0x0 # no operands
|
|
+MOP_FMUL = 0x1
|
|
+MOP_MUL24 = 0x2
|
|
+MOP_V8MULD = 0x3
|
|
+MOP_V8MIN = 0x4
|
|
+MOP_V8MAX = 0x5
|
|
+MOP_V8ADDS = 0x6
|
|
+MOP_V8SUBS = 0x7
|
|
+
|
|
+# ldi modes
|
|
+############
|
|
+
|
|
+LDI_32 = 0
|
|
+LDI_EL_SIGNED = 1
|
|
+LDI_EL_UNSIGNED = 3
|
|
+LDI_SEMA = 4
|
|
+
|
|
+# conds
|
|
+########
|
|
+
|
|
+COND_NEVER = 0
|
|
+COND_ALWAYS = 1
|
|
+COND_IFZ = 2
|
|
+COND_IFNZ = 3
|
|
+COND_IFN = 4
|
|
+COND_IFNN = 5
|
|
+COND_IFC = 6
|
|
+COND_IFNC = 7
|
|
+
|
|
+BCOND_ALLZ = 0
|
|
+BCOND_ALLNZ = 1
|
|
+BCOND_ANYZ = 2
|
|
+BCOND_ANYNZ = 3
|
|
+BCOND_ALLN = 4
|
|
+BCOND_ALLNN = 5
|
|
+BCOND_ANYN = 6
|
|
+BCOND_ANYNN = 7
|
|
+BCOND_ALLC = 8
|
|
+BCOND_ALLNC = 9
|
|
+BCOND_ANYC = 10
|
|
+BCOND_ANYNC = 11
|
|
+BCOND_ALWAYS = 15
|
|
+
|
|
+# packing/unpacking
|
|
+####################
|
|
+
|
|
+# regfile a pack modes
|
|
+PACK_A_NOP = 0
|
|
+PACK_A_16A = 1
|
|
+PACK_A_16B = 2
|
|
+PACK_A_8888 = 3
|
|
+PACK_A_8A = 4
|
|
+PACK_A_8B = 5
|
|
+PACK_A_8C = 6
|
|
+PACK_A_8D = 7
|
|
+PACK_A_32S = 8
|
|
+PACK_A_16AS = 9
|
|
+PACK_A_16BS = 10
|
|
+PACK_A_8888S = 11
|
|
+PACK_A_8AS = 12
|
|
+PACK_A_8BS = 13
|
|
+PACK_A_8CS = 14
|
|
+PACK_A_8DS = 15
|
|
+
|
|
+# mul unit pack modes
|
|
+PACK_MUL_NOP = 0
|
|
+PACK_MUL_8888 = 3
|
|
+PACK_MUL_8A = 4
|
|
+PACK_MUL_8B = 5
|
|
+PACK_MUL_8C = 6
|
|
+PACK_MUL_8D = 7
|
|
+
|
|
+# regfile a unpack modes
|
|
+UNPACK_A_NOP = 0
|
|
+UNPACK_A_16A = 1
|
|
+UNPACK_A_16B = 2
|
|
+UNPACK_A_8R = 3
|
|
+UNPACK_A_8A = 4
|
|
+UNPACK_A_8B = 5
|
|
+UNPACK_A_8C = 6
|
|
+UNPACK_A_8D = 7
|
|
+
|
|
+# r4 unpack modes
|
|
+UNPACK_R4_NOP = 0
|
|
+UNPACK_R4_16A = 1
|
|
+UNPACK_R4_16B = 2
|
|
+UNPACK_R4_8R = 3
|
|
+UNPACK_R4_8A = 4
|
|
+UNPACK_R4_8B = 5
|
|
+UNPACK_R4_8C = 6
|
|
+UNPACK_R4_8D = 7
|
|
+
|
|
+PACK_TYPE_INT = 0
|
|
+PACK_TYPE_FLOAT = 1
|
|
+PACK_TYPE_EITHER = -1
|
|
+
|
|
+PACK_MODE_A = 0 # regfile a
|
|
+PACK_MODE_M = 1 # mul unit
|
|
+PACK_MODE_EITHER = -1
|
|
+
|
|
+UNPACK_LOC_A = 0 # regfile a
|
|
+UNPACK_LOC_R4 = 1 # r4
|
|
+UNPACK_LOC_AB = 2 # either regfile a or regfile b
|
|
+UNPACK_LOC_OTHER = 3 # somewhere else
|
|
+
|
|
+# args
|
|
+#######
|
|
+
|
|
+# loc_t, ie internal
|
|
+MUX_AC = 0
|
|
+MUX_ANY = 1
|
|
+MUX_A = 2
|
|
+MUX_B = 3
|
|
+RW_EITHER = 0
|
|
+RW_READ = 1
|
|
+RW_WRITE = 2
|
|
+
|
|
+RADDR_NOP = 39
|
|
+
|
|
+# negatives are for internal use
|
|
+RMUX_SEMA = -6
|
|
+RMUX_LABEL = -5
|
|
+RMUX_IMMV = -4
|
|
+RMUX_IMM = -3
|
|
+RMUX_AC = -2
|
|
+RMUX_ANY = -1
|
|
+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5
|
|
+RMUX_A = 6
|
|
+RMUX_B = 7
|
|
+
|
|
+WADDR_R0 = 32 # followed by R1, R2, R3
|
|
+WADDR_NOP = 39
|
|
+
|
|
+WMUX_ANY = 0
|
|
+WMUX_A = 1
|
|
+WMUX_B = 2
|
|
+
|
|
+# signals
|
|
+##########
|
|
+
|
|
+SIG_BKPT = 0
|
|
+SIG_NORMAL = 1
|
|
+SIG_THRSW = 2
|
|
+SIG_THREND = 3
|
|
+SIG_SBWAIT = 4
|
|
+SIG_SBDONE = 5
|
|
+SIG_INT = 6 # on a0
|
|
+SIG_LTHRSW = 6 # on b0
|
|
+SIG_LOADCV = 7
|
|
+SIG_LOADC = 8
|
|
+SIG_LDCEND = 9
|
|
+SIG_LDTMU0 = 10
|
|
+SIG_LDTMU1 = 11
|
|
+SIG_ROTATE = 12 # on a0
|
|
+SIG_LOADAM = 12 # on b0
|
|
+SIG_SMALLIMMED = 13
|
|
+SIG_IMMED = 14
|
|
+SIG_BRANCH = 15
|
|
+
|
|
+# multi-line assembler constructs
|
|
+##################################
|
|
+
|
|
+CONSTRUCT_MACRO = 0x1
|
|
+CONSTRUCT_IF = 0x2
|
|
+CONSTRUCT_ELSE = 0x4
|
|
+CONSTRUCT_REP = 0x8
|
|
+
|
|
+###############################################################################
|
|
+# helpers
|
|
+###############################################################################
|
|
+
|
|
+def asm_error(message, location = None):
|
|
+ if location is None:
|
|
+ location = current_location
|
|
+ if location == '':
|
|
+ sys.stderr.write('qasm ERROR: %s\n' % message)
|
|
+ else:
|
|
+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message))
|
|
+ sys.exit(-1)
|
|
+
|
|
+def asm_warning(message, location = None):
|
|
+ if disable_warnings or (nwarn_level != 0):
|
|
+ return
|
|
+ if location is None:
|
|
+ location = current_location
|
|
+ if location == '':
|
|
+ sys.stderr.write('qasm WARNING: %s\n' % message)
|
|
+ else:
|
|
+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message))
|
|
+ if warnings_are_errors:
|
|
+ asm_error('warnings are errors!', location)
|
|
+
|
|
+# smart_split('') = []
|
|
+# smart_split('a') = ['a']
|
|
+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
|
|
+def smart_split(s, delim = ',', count = 0):
|
|
+ if len(s) == 0:
|
|
+ return []
|
|
+ parts = []
|
|
+ depth = 0
|
|
+ i = 0
|
|
+ for j in xrange(len(s)):
|
|
+ if s[j] in '([{':
|
|
+ depth += 1
|
|
+ elif s[j] in ')]}':
|
|
+ depth -= 1
|
|
+ elif (s[j] == delim) and (depth == 0):
|
|
+ parts.append(s[i:j])
|
|
+ i = j + 1
|
|
+ if len(parts) == count:
|
|
+ break
|
|
+ if depth != 0:
|
|
+ asm_error('bracket nesting fail')
|
|
+ parts.append(s[i:])
|
|
+ return parts
|
|
+
|
|
+def is_int(x):
|
|
+ return isinstance(x, int) or isinstance(x, long)
|
|
+
|
|
+###############################################################################
|
|
+# "parsing" stuff
|
|
+###############################################################################
|
|
+
|
|
+re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$')
|
|
+re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
|
|
+re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
|
|
+re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
|
|
+re_include = re.compile('\\.include\\s(?P<filename>.+)$')
|
|
+re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
|
|
+re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
|
|
+re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
|
|
+re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
|
|
+re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
|
|
+re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$')
|
|
+re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
|
|
+re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$')
|
|
+re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$')
|
|
+re_label_ref_left = re.compile('\\b([ar]):')
|
|
+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$')
|
|
+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
|
|
+
|
|
+# ops
|
|
+######
|
|
+
|
|
+aops = {
|
|
+ 'mov': (AOP_MOV, 2),
|
|
+ 'bra': (AOP_BRA, 2),
|
|
+ 'brr': (AOP_BRR, 2),
|
|
+ 'nop': (AOP_NOP, 0),
|
|
+ 'fadd': (AOP_FADD, 3),
|
|
+ 'fsub': (AOP_FSUB, 3),
|
|
+ 'fmin': (AOP_FMIN, 3),
|
|
+ 'fmax': (AOP_FMAX, 3),
|
|
+ 'fminabs': (AOP_FMINABS, 3),
|
|
+ 'fmaxabs': (AOP_FMAXABS, 3),
|
|
+ 'ftoi': (AOP_FTOI, 2),
|
|
+ 'itof': (AOP_ITOF, 2),
|
|
+ 'add': (AOP_ADD, 3),
|
|
+ 'sub': (AOP_SUB, 3),
|
|
+ 'shr': (AOP_SHR, 3),
|
|
+ 'asr': (AOP_ASR, 3),
|
|
+ 'ror': (AOP_ROR, 3),
|
|
+ 'shl': (AOP_SHL, 3),
|
|
+ 'min': (AOP_MIN, 3),
|
|
+ 'max': (AOP_MAX, 3),
|
|
+ 'and': (AOP_AND, 3),
|
|
+ 'or': (AOP_OR, 3),
|
|
+ 'xor': (AOP_XOR, 3),
|
|
+ 'not': (AOP_NOT, 2),
|
|
+ 'clz': (AOP_CLZ, 2),
|
|
+ 'v8adds': (AOP_V8ADDS, 3),
|
|
+ 'v8subs': (AOP_V8SUBS, 3)}
|
|
+
|
|
+def get_aop(aop):
|
|
+ if aop not in aops:
|
|
+        asm_error('invalid aop')
+    return aops[aop]
+
+mops = {
+    'mov': (MOP_MOV, 2),
+    'nop': (MOP_NOP, 0),
+    'fmul': (MOP_FMUL, 3),
+    'mul24': (MOP_MUL24, 3),
+    'v8muld': (MOP_V8MULD, 3),
+    'v8min': (MOP_V8MIN, 3),
+    'v8max': (MOP_V8MAX, 3),
+    'v8adds': (MOP_V8ADDS, 3),
+    'v8subs': (MOP_V8SUBS, 3)}
+
+def get_mop(mop):
+    if mop not in mops:
+        asm_error('invalid mop')
+    return mops[mop]
+
+# conds
+########
+
+conds = {
+    'ifz': COND_IFZ,
+    'ifnz': COND_IFNZ,
+    'ifn': COND_IFN,
+    'ifnn': COND_IFNN,
+    'ifc': COND_IFC,
+    'ifnc': COND_IFNC}
+
+def get_cond(cond):
+    if not cond:
+        return COND_ALWAYS
+    if cond not in conds:
+        asm_error('invalid cond')
+    return conds[cond]
+
+bconds = {
+    'allz': BCOND_ALLZ,
+    'allnz': BCOND_ALLNZ,
+    'anyz': BCOND_ANYZ,
+    'anynz': BCOND_ANYNZ,
+    'alln': BCOND_ALLN,
+    'allnn': BCOND_ALLNN,
+    'anyn': BCOND_ANYN,
+    'anynn': BCOND_ANYNN,
+    'allc': BCOND_ALLC,
+    'allnc': BCOND_ALLNC,
+    'anyc': BCOND_ANYC,
+    'anync': BCOND_ANYNC}
+
+def get_bcond(bcond):
+    if not bcond:
+        return BCOND_ALWAYS
+    if bcond not in bconds:
+        asm_error('invalid bcond')
+    return bconds[bcond]
+
+def get_setf(setf):
+    if not setf:
+        return False
+    return True
+
+# packing/unpacking
+####################
+
+packs = {
+    '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A),
+    '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A),
+    '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A),
+    '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A),
+    '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A),
+    's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A),
+    '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A),
+    '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A),
+    '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M),
+    '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M),
+    '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M),
+    '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M),
+    '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)}
+
+def get_pack(pack):
+    if not pack:
+        return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER)
+    if pack not in packs:
+        asm_error('invalid pack')
+    return packs[pack]
+
+a_unpacks = {
+    '16a': (UNPACK_A_16A, PACK_TYPE_INT),
+    '16b': (UNPACK_A_16B, PACK_TYPE_INT),
+    '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT),
+    '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT),
+    '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER),
+    '8a': (UNPACK_A_8A, PACK_TYPE_INT),
+    '8b': (UNPACK_A_8B, PACK_TYPE_INT),
+    '8c': (UNPACK_A_8C, PACK_TYPE_INT),
+    '8d': (UNPACK_A_8D, PACK_TYPE_INT),
+    '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT),
+    '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT),
+    '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT),
+    '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)}
+
+def get_a_unpack(unpack):
+    if not unpack:
+        return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A)
+    if unpack not in a_unpacks:
+        asm_error('invalid ra unpack')
+    return a_unpacks[unpack] + (UNPACK_LOC_A,)
+
+r4_unpacks = {
+    '16af': UNPACK_R4_16A,
+    '16bf': UNPACK_R4_16B,
+    '8dr': UNPACK_R4_8R,
+    '8ac': UNPACK_R4_8A,
+    '8bc': UNPACK_R4_8B,
+    '8cc': UNPACK_R4_8C,
+    '8dc': UNPACK_R4_8D}
+
+def get_r4_unpack(unpack):
+    if not unpack:
+        return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4)
+    if unpack not in r4_unpacks:
+        asm_error('invalid r4 unpack')
+    return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4)
+
+# args
+#######
+
+class loc_t:
+    def __init__(self, mux, i, rot, r5_rot, pack, rw):
+        self.mux = mux
+        self.i = i
+        self.rot = rot % 16
+        self.r5_rot = r5_rot % 16
+        self.pack = pack
+        self.rw = rw
+
+    def copy(self):
+        return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw)
+
+    def __add__(self, i):
+        if not is_int(i):
+            raise Exception('can only add integer to loc')
+        return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw)
+
+    def __sub__(self, i):
+        if not is_int(i):
+            raise Exception('can only subtract integer from loc')
+        return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw)
+
+    def __cmp__(self, other):
+        if is_int(other):
+            return cmp(self.i, other)
+        if not isinstance(other, loc_t):
+            raise Exception('can only compare loc to integer or other loc')
+        if self.mux != other.mux:
+            return cmp(self.mux, other.mux)
+        if self.i != other.i:
+            return cmp(self.i, other.i)
+        if self.rot != other.rot:
+            return cmp(self.rot, other.rot)
+        if self.r5_rot != other.r5_rot:
+            return cmp(self.r5_rot, other.r5_rot)
+        return cmp(self.pack, other.pack)
+
+    def is_r5(self):
+        return (self.mux == MUX_AC) and (self.i == 5)
+
+    def shift(self, rot, left):
+        if isinstance(rot, loc_t) and rot.is_r5():
+            if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack:
+                raise Exception('can\'t rotate by rotated/unpacked r5')
+            return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw)
+        if not is_int(rot):
+            raise Exception('can only rotate by integer or r5')
+        return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw)
+
+    def __lshift__(self, rot):
+        return self.shift(rot, True)
+
+    def __rshift__(self, rot):
+        return self.shift(rot, False)
+
+    def __getattr__(self, name):
+        # discard the first character if it is an underscore. this is a total hack
+        # to allow packs starting with a digit to work
+        if name[0] == '_':
+            name = name[1:]
+        if (name in packs) or (name in a_unpacks) or (name in r4_unpacks):
+            if self.pack:
+                raise Exception('can\'t specify two packs')
+            return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw)
+        raise AttributeError()
+
+    def __str__(self):
+        if self.mux == MUX_AC:
+            return 'r%d' % self.i
+        if self.mux == MUX_ANY:
+            return 'rany%d' % self.i
+        if self.mux == MUX_A:
+            return 'ra%d' % self.i
+        if self.mux == MUX_B:
+            return 'rb%d' % self.i
+        assert 0
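+
+# Illustrative sketch, not part of the original patch: how loc_t implements the
+# assembler's register syntax. Attribute access applies a pack/unpack suffix
+# (source text 'ra0.8a' is rewritten to 'ra0._8a' before eval), and the shift
+# operators accumulate rotations modulo 16:
+#
+#   reg = arg_defs['ra0']            # loc_t(MUX_A, 0, 0, 0, None, RW_EITHER)
+#   packed = reg._8a                 # same location, pack set to '8a'
+#   rotated = arg_defs['r0'] << 3    # left shift: rot becomes (0 - 3) % 16 == 13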
+
+class sema_t:
+    def __init__(self, acq, i):
+        if not is_int(i):
+            raise Exception('semaphore index must be integer')
+        self.acq = acq
+        self.i = i
+
+class label_t:
+    def __init__(self, rel, name, offset):
+        self.rel = rel
+        self.name = name
+        self.offset = offset
+
+    def __add__(self, offset):
+        return label_t(self.rel, self.name, self.offset + offset)
+
+    def __sub__(self, offset):
+        return label_t(self.rel, self.name, self.offset - offset)
+
+class label_maker_t:
+    def __init__(self, rel):
+        self.rel = rel
+
+    def __getattr__(self, name):
+        # we discard the first character. this is a total hack to allow numeric labels to work
+        if not re_label_ref_right.match(name[1:]):
+            raise Exception('invalid label reference')
+        return label_t(self.rel, name[1:], 0)
+
+def bits(x, n):
+    if (x >> n) != 0:
+        raise Exception('%d doesn\'t fit in %d bits' % (x, n))
+    return x
+
+def bitsw(x, n):
+    if x == (1 << n):
+        x = 0
+    return bits(x, n)
+
+def bitsws(x, n):
+    if x == (1 << (n - 1)):
+        x = 0
+    if -(1 << (n - 1)) <= x < 0:
+        x += 1 << n
+    return bits(x, n)
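+
+# Illustrative sketch, not part of the original patch: bits() is a plain width
+# check, bitsw() additionally lets 2**n encode as 0 (for fields where 0 means
+# the maximum count), and bitsws() is the signed variant, folding negatives
+# into two's complement:
+#
+#   bits(5, 3)       # == 5, fits in 3 bits
+#   bitsw(16, 4)     # == 0, 16 wraps to 0 in a 4-bit field
+#   bitsws(-2, 7)    # == 126, two's complement in 7 bits
+#   bitsws(64, 7)    # == 0, +2**(n - 1) also wraps to 0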
+
+def vpm_setup(n, stride, addr, v2 = False):
+    horiz, laned, size, y, x, p = addr
+    if size not in (0, 1, 2):
+        raise Exception('addr size should be 0, 1, or 2')
+    if horiz:
+        if x != 0:
+            raise Exception('horizontal accesses must have x of 0')
+    else:
+        if (y & 0xf) != 0:
+            raise Exception('vertical accesses must be 16 row aligned')
+    hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size)
+    if v2:
+        return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) |
+            (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size))
+    return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) |
+        (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size))
+
+def vdw_setup_0(n, m, addr):
+    horiz, size, y, x, p = addr
+    if size not in (0, 1, 2):
+        raise Exception('addr size should be 0, 1, or 2')
+    return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) |
+        (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size))
+
+def vdr_setup_0(n, m, addr, vpm_stride, stride):
+    horiz, size, y, x, p = addr
+    if size not in (0, 1, 2):
+        raise Exception('addr size should be 0, 1, or 2')
+    if (stride < 8) or (stride & (stride - 1)):
+        raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride')
+    log2_stride = 3
+    while (1 << log2_stride) != stride:
+        log2_stride += 1
+    return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) |
+        (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) |
+        (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4))
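+
+# Illustrative sketch, not part of the original patch: composing a VPM setup
+# word with the helpers above. h32 (defined below in arg_defs) packs
+# (horiz=1, laned=0, size=0 (32-bit), y, x=0, p=0), so reading 16 horizontal
+# 32-bit rows from y=0 with stride 1 encodes as:
+#
+#   vpm_setup(16, 1, h32(0))
+#     == (bitsw(16, 4) << 20) | (bitsw(1, 6) << 12) | (0xa << 8)
+#     == 0x00001a00    # n=16 wraps to 0, hls=0b1010 for horizontal 32-bit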
+
+class allocator_t:
+    def __init__(self, *available):
+        self.available = list(available)
+        self.allocated = {}
+        self.reserved = []
+
+    def copy(self):
+        a = allocator_t()
+        a.available = self.available[:]
+        a.allocated = self.allocated.copy()
+        a.reserved = self.reserved[:]
+        return a
+
+    def forget(self):
+        self.__init__(*(self.available + self.allocated.values() + self.reserved))
+
+    def reserve(self, *rs):
+        for r in rs:
+            self.available.remove(r)
+            self.reserved.append(r)
+
+    def retire(self, name):
+        r = self.allocated.pop(name)
+        del r.__invert__
+        del r.retire
+        self.available.append(r)
+        return r
+
+    def __getattr__(self, name):
+        if name not in self.allocated:
+            r = self.available.pop()
+            r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax
+            r.__invert__ = r.retire
+            self.allocated[name] = r
+        return self.allocated[name]
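+
+# Illustrative sketch, not part of the original patch: typical allocator_t use
+# via the assembler's .set/.eval directives. Attribute access allocates a
+# register and binds it to the name; ~ (or .retire()) returns it to the pool.
+# loc_t is an old-style class, so the per-instance __invert__ hook works:
+#
+#   .set rs, allocator_t(ra0, ra1, ra2)
+#   mov rs.tmp, unif    # pops ra2 from the pool and names it 'tmp'
+#   .eval ~rs.tmp       # retires ra2 back to the available list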
+
+def pragma_allow_xor_0(x):
+    global allow_xor_0
+
+    if not isinstance(x, bool):
+        raise Exception('allow_xor_0 must be bool')
+    x, allow_xor_0 = allow_xor_0, x
+    return x
+
+def pragma_dont_warn_when_mul_rot_inp_r5(x):
+    global dont_warn_when_mul_rot_inp_r5
+
+    if not isinstance(x, bool):
+        raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool')
+    x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x
+    return x
+
+arg_defs = {
+    # special reg names (these alias the regular names, but also have appropriate read/write restrictions)
+    'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER),
+    'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER),
+    'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ),
+    'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ),
+    'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE),
+    'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE),
+    'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE),
+    'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ),
+    'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ),
+    'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE),
+    'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE),
+    'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER),
+    'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER),
+    'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER),
+    'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER),
+    'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE),
+    'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE),
+    'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE),
+    'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE),
+    'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER),
+    'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ),
+    'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ),
+    'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE),
+    'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE),
+    'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ),
+    'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ),
+    'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE),
+    'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE),
+    'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER),
+    'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE),
+    'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+    'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+    'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE),
+    'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE),
+    't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE),
+    't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE),
+    't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE),
+    't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE),
+    't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE),
+    't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE),
+    't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE),
+    't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE),
+
+    # semaphore acq/rel
+    'sacq': lambda i: sema_t(True, i),
+    'srel': lambda i: sema_t(False, i),
+
+    # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label)
+    'r_label_maker': label_maker_t(True),
+    'a_label_maker': label_maker_t(False),
+
+    # handy functions
+    'f': lambda x: struct.unpack('I', struct.pack('f', x))[0],
+    'sqrt': math.sqrt,
+    'sin': math.sin,
+    'cos': math.cos,
+    'atan2': math.atan2,
+    'pi': math.pi,
+    'rseed': random.seed,
+    'rand': lambda: int(random.getrandbits(32)),
+    'bits': bits,
+    'bitsw': bitsw,
+    'bitsws': bitsws,
+
+    # handy vpm/vdw/vdr stuff
+    'h32': lambda y: (1, 0, 0, y, 0, 0),
+    'h16l': lambda y, p: (1, 1, 1, y, 0, p),
+    'h16p': lambda y, p: (1, 0, 1, y, 0, p),
+    'h8l': lambda y, p: (1, 1, 2, y, 0, p),
+    'h8p': lambda y, p: (1, 0, 2, y, 0, p),
+    'v32': lambda y, x: (0, 0, 0, y, x, 0),
+    'v16l': lambda y, x, p: (0, 1, 1, y, x, p),
+    'v16p': lambda y, x, p: (0, 0, 1, y, x, p),
+    'v8l': lambda y, x, p: (0, 1, 2, y, x, p),
+    'v8p': lambda y, x, p: (0, 0, 2, y, x, p),
+    'dma_h32': lambda y, x: (1, 0, y, x, 0),
+    'dma_h16p': lambda y, x, p: (1, 1, y, x, p),
+    'dma_h8p': lambda y, x, p: (1, 2, y, x, p),
+    'dma_v32': lambda y, x: (0, 0, y, x, 0),
+    'dma_v16p': lambda y, x, p: (0, 1, y, x, p),
+    'dma_v8p': lambda y, x, p: (0, 2, y, x, p),
+    'vpm_setup': vpm_setup,
+    'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True),
+    'vdw_setup_0': vdw_setup_0,
+    'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13),
+    'vdr_setup_0': vdr_setup_0,
+    'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride
+    'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13),
+
+    # annotations
+    'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)),
+    'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff),
+    'preserve_cond': ('preserve_cond', 1),
+
+    # somewhat experimental register allocator
+    'allocator_t': allocator_t,
+
+    # pragmas
+    'pragma_allow_xor_0': pragma_allow_xor_0,
+    'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5}
+
+# accumulators and regs (regular names -- r0, ra0, etc)
+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6))
+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+
+def arg_eval(arg, sets):
+    s = (arg.strip().split('.', 1) + [None])[:2]
+    if s[0] == '-':
+        return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE)
+    arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings...
+    arg = re_pack.sub('._\\1', arg)
+    try:
+        # todo: i would like to be able to pass both arg_defs and sets in here
+        # (with sets hiding arg_defs in the case of conflicts), but the obvious
+        # dict(arg_defs, **sets) won't permit things such as:
+        # .set f, lambda x: y
+        # .set y, 4
+        # (the y in the lambda will be looked up in the temporary dict we created
+        # when evaluating the f .set, which doesn't contain y)
+        #
+        # instead, sets is initially set to (a copy of) arg_defs. to simulate the
+        # hiding behaviour, on an unset, we restore any hidden arg_defs value.
+        # also, before dumping sets at the end, we strip out the arg_defs stuff
+        # (this isn't entirely correct as we want to dump sets that are hiding
+        # arg_defs)
+        return eval(arg, sets)
+    except Exception, e:
+        asm_error(e)
+    except:
+        asm_error('unknown error while evaluating argument')
+
+# doesn't check/fixup pack
+def check_and_fixup_loc(loc, read):
+    if (not read) and (loc.rw == RW_READ):
+        asm_error('writing to read-only hardware register')
+    if read and (loc.rw == RW_WRITE):
+        asm_error('reading from write-only hardware register')
+    if not read:
+        # conceptually, we are writing to a location rotated right by
+        # loc.rot/loc.r5_rot. but we are actually rotating the output right by
+        # -loc.rot/-loc.r5_rot then writing it to the unrotated location
+        loc.rot = -loc.rot % 16
+        loc.r5_rot = -loc.r5_rot % 16
+    if (loc.rot != 0) and (loc.r5_rot != 0):
+        asm_error('can\'t rotate by both r5 and immediate')
+    if (loc.r5_rot != 0) and (loc.r5_rot != 1):
+        asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read])
+    if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later
+        if not read:
+            asm_error('target doesn\'t support write rotation')
+        if loc.mux == MUX_ANY:
+            loc.mux = MUX_A # can't do rotated read from regfile b
+        if loc.mux != MUX_A:
+            asm_error('rotation on read only allowed from regfile a')
+        if loc.i >= 32:
+            asm_warning('rotation only works from physical regfile')
+    if loc.mux == MUX_AC:
+        if (loc.i < 0) or (loc.i >= 6):
+            asm_error('reg out of range')
+        if not read:
+            if loc.i == 4:
+                asm_error('not allowed to write to r4')
+            if loc.i == 5:
+                asm_error('not allowed to write to r5 -- please specify r5quad or r5rep')
+    elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B):
+        if (loc.i < 0) or (loc.i >= 64):
+            asm_error('reg out of range')
+    else:
+        assert 0
+
+def get_dst(dst, sets):
+    if not dst:
+        return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0
+    dst = arg_eval(dst, sets)
+    if not isinstance(dst, loc_t):
+        asm_error('invalid dst')
+    dst = dst.copy()
+    check_and_fixup_loc(dst, False)
+    pack = get_pack(dst.pack)
+    if dst.mux == MUX_AC:
+        if pack[2] == PACK_MODE_A:
+            asm_warning('ra packing only works when writing to physical regfile')
+            return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+        return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+    if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation
+        if (pack[2] == PACK_MODE_A) and (dst.i >= 32):
+            asm_warning('ra packing only works when writing to physical regfile')
+        return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+    if dst.mux == MUX_ANY:
+        return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+    if dst.mux == MUX_B:
+        if pack[2] == PACK_MODE_A:
+            asm_error('this packing operation can only be used for regfile a')
+        return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot
+    assert 0
+
+def get_src(src, sets):
+    if not src:
+        return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None
+    src = arg_eval(src, sets)
+    if isinstance(src, sema_t):
+        if not have_sema:
+            asm_error('target does not support semaphores')
+        if (src.i < 0) or (src.i >= 16):
+            asm_error('semaphore number must be in [0, 16)')
+        return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+    if isinstance(src, label_t):
+        return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+    if isinstance(src, list):
+        if len(src) != 16:
+            asm_error('vector immediate must have length 16')
+        src = src[:]
+        for i in xrange(16):
+            if not is_int(src[i]):
+                asm_error('all elements of vector immediate must be integers')
+            src[i] &= (1 << 32) - 1
+        return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+    if is_int(src):
+        return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+    if not isinstance(src, loc_t):
+        asm_error('invalid src')
+    src = src.copy()
+    check_and_fixup_loc(src, True)
+    if mulw_rotate:
+        srot, sr5rot = 0, 0
+        drot, dr5rot = src.rot, src.r5_rot
+    else:
+        srot, sr5rot = src.rot, src.r5_rot
+        drot, dr5rot = 0, 0
+    if src.mux == MUX_AC:
+        if src.i == 4:
+            return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot
+        if src.pack:
+            asm_error('unpack only allowed for regfile a or r4')
+        return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+    if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b
+        return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot
+    if src.mux == MUX_ANY:
+        return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot
+    if src.mux == MUX_B:
+        if src.pack:
+            asm_error('unpack only allowed for regfile a or r4')
+        return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+    assert 0
+
+# signals
+##########
+
+sigs = {
+    'bkpt': SIG_BKPT,
+    'thrsw': SIG_THRSW,
+    'thrend': SIG_THREND,
+    'sbwait': SIG_SBWAIT,
+    'sbdone': SIG_SBDONE,
+    'int': SIG_INT,
+    'loadcv': SIG_LOADCV,
+    'loadc': SIG_LOADC,
+    'ldcend': SIG_LDCEND,
+    'ldtmu0': SIG_LDTMU0,
+    'ldtmu1': SIG_LDTMU1}
+
+def get_sig(sig):
+    if sig not in sigs:
+        return SIG_NORMAL
+    return sigs[sig]
+
+# annotations
+##############
+
+def get_annots(annot, sets):
+    annots = arg_eval(annot, sets)
+    if isinstance(annots, list):
+        annots = annots[:]
+    else:
+        annots = [annots]
+    for i, annot in enumerate(annots):
+        if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or
+            (not is_int(annot[1]))):
+            asm_error('annotation must be (string, integer) pair, or a list of such pairs')
+        annots[i] = (annot[0], annot[1] & ((1 << 32) - 1))
+    return annots
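+
+# Illustrative sketch, not part of the original patch: annotations are written
+# after '@' on an instruction line and evaluate (via the arg_defs helpers
+# above) to (name, value) pairs, e.g.:
+#
+#   v8adds r0, r1, r2  @ mul_used(0, 1, 2, 3)    # ('mul_used', 0x000f)
+#   mov.ifz ra4, 0     @ preserve_cond           # ('preserve_cond', 1)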
+
+###############################################################################
+# core
+###############################################################################
+
+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats):
+    needfloat = PACK_TYPE_EITHER
+    havefloata = False
+    havefloatr4 = False
+    unpacka = None
+    unpackr4 = None
+    forcebs = [False, False, False, False]
+    forcerafloat = False
+
+    pm = PACK_MODE_EITHER
+    for i in (0, 1, 2, 3):
+        if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB):
+            assert rpacks[i][0] == 0
+        else:
+            if rpacks[i][2] == UNPACK_LOC_A:
+                if unpacka is None:
+                    unpacka = rpacks[i][0]
+                elif unpacka != rpacks[i][0]:
+                    asm_error('conflicting unpack operations on regfile a')
+                havefloata = havefloata or rfloats[i]
+            elif rpacks[i][2] == UNPACK_LOC_R4:
+                if unpackr4 is None:
+                    unpackr4 = rpacks[i][0]
+                elif unpackr4 != rpacks[i][0]:
+                    asm_error('conflicting unpack operations on r4')
+                havefloatr4 = havefloatr4 or rfloats[i]
+            else:
+                assert 0
+
+            if rpacks[i][1] != PACK_TYPE_EITHER:
+                if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]):
+                    asm_error('conflicting unpack float requirements')
+                needfloat = rpacks[i][1]
+    for i in (0, 1, 2, 3):
+        if rpacks[i][2] == UNPACK_LOC_AB:
+            if (unpacka is not None) and (unpacka != UNPACK_A_NOP):
+                forcebs[i] = True # non-nop unpack from regfile a. must use b
+
+    if unpacka:
+        if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat:
+            havefloata = True
+            forcerafloat = True
+        havefloat = havefloata
+    else:
+        havefloat = havefloatr4
+
+    if (needfloat == PACK_TYPE_FLOAT) and (not havefloat):
+        asm_error('float unpack operation used in integer alu operations')
+    if (needfloat == PACK_TYPE_INT) and havefloat:
+        asm_error('integer unpack operation used in float alu operation')
+
+    unpack = 0
+    if unpacka and unpackr4:
+        asm_error('cannot specify pack operation for both regfile a and r4')
+    if unpacka:
+        pm = PACK_MODE_A
+        unpack = unpacka
+    elif unpackr4:
+        pm = PACK_MODE_M
+        unpack = unpackr4
+
+    pack = 0
+    if wpacks[0][2] == PACK_MODE_M:
+        asm_error('mul-unit pack operation used on add result')
+    for i in (0, 1):
+        if wpacks[i][2] == PACK_MODE_A:
+            if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A):
+                asm_error('conflicting pack modes')
+            pm = PACK_MODE_A
+            pack = wpacks[i][0]
+        elif wpacks[i][2] == PACK_MODE_M:
+            if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M):
+                asm_error('conflicting pack modes')
+            pm = PACK_MODE_M
+            pack = wpacks[i][0]
+
+        if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]):
+            asm_error('float pack operation used with integer alu result')
+        if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]:
+            asm_error('integer pack operation used with float alu result')
+
+    if pm == PACK_MODE_EITHER:
+        pm = PACK_MODE_A
+    return pm, pack, unpack, forcebs, forcerafloat
+
+# immediates that can be encoded with SIG_SMALLIMMED
+bimms = {}
+bimms.update((i, i) for i in xrange(16))
+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32))
+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40))
+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48))
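+
+# Illustrative sketch, not part of the original patch: the four ranges above
+# cover the QPU small-immediate encodings -- 0..15 encode as themselves,
+# -16..-1 (masked to 32 bits) as 16..31, the float powers 1.0..128.0 (ieee
+# patterns (127 + e) << 23 for e = 0..7) as 32..39, and 1/256..1/2
+# (e = -8..-1) as 40..47:
+#
+#   bimms[7]                    # == 7
+#   bimms[-2 & 0xffffffff]      # == 30
+#   bimms[f(4.0)]               # == 34, as f(4.0) == (127 + 2) << 23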
+
+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux):
+    if rmux == RMUX_SEMA:
+        asm_error('semaphore op can only be used with mov')
+    if rmux == RMUX_LABEL:
+        asm_error('label not allowed here')
+    if rmux == RMUX_IMMV:
+        asm_error('vector immediate can only be used with mov')
+    if rmux == RMUX_IMM:
+        if raddr not in bimms:
+            asm_error('can\'t encode immediate 0x%08x' % raddr)
+        raddr = bimms[raddr]
+        if not immb:
+            if raddr_b is not None:
+                asm_error('regfile b and immediates don\'t mix')
+            raddr_b = raddr
+            immb = True
+        elif raddr_b != raddr:
+            asm_error('can only encode one rotation/immediate')
+        return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+    if rmux == RMUX_AC:
+        return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr
+    if rmux == RMUX_ANY:
+        if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr):
+            return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+        if (not immb) and (raddr_b == raddr):
+            return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+        if raddr_a is None:
+            assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))
+            raddr_a = raddr
+            return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+        if raddr_b is None:
+            assert not immb
+            raddr_b = raddr
+            return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+        asm_error('no free read slots')
+    if rmux == RMUX_A:
+        if (not mulw_rotate) and (raddr_a is not None) and (
+            ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))):
+            asm_error('conflicting rotations from regfile a')
+        if raddr_a is None:
+            raddr_a = raddr[0]
+        elif raddr_a != raddr[0]:
+            asm_error('can only read from one location in each regfile')
+        arot_r5 = raddr[2]
+        if raddr[1] == 0:
+            return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+        raddr = 48 + raddr[1]
+        if not immb:
+            if raddr_b is not None:
+                asm_error('regfile b and rotation don\'t mix')
+            raddr_b = raddr
+            immb = True
+        elif raddr_b != raddr:
+            asm_error('can only encode one rotation/immediate')
+        return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+    if rmux == RMUX_B:
+        if immb:
+            asm_error('regfile b and rotation/immediates don\'t mix')
+        if raddr_b is None:
+            raddr_b = raddr
+        elif raddr_b != raddr:
+            asm_error('can only read from one location in each regfile')
+        return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+    assert 0
+
+# ok if:
+# - accumulator (r0-r3)
+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy,
+#   and vw_busy. it's also true of r5 if it was written by r5rep, but not if it
+#   was written by r5quad. so, by default, r5 isn't considered uniform. todo:
+#   what about vr_wait/vw_wait/mutex?
+def read_rot_ok(rmux, raddr_a, raddr_b):
+    return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or
+        ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy
+        ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy
+
+def asm_flush_prog_data():
+    global prog_data
+
+    while len(prog_data) & 7:
+        prog_data.append(0)
+    for i in xrange(0, len(prog_data), 8):
+        prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0),
+            (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {}))
+    prog_data = []
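+
+# Illustrative sketch, not part of the original patch: data bytes queue up in
+# prog_data and are flushed here as little-endian 32-bit word pairs. The bytes
+# 0x11 0x22 0x33 0x44 0x55 0x66 0x77 0x88 become the pseudo-instruction
+# (0x44332211, 0x88776655); shorter runs are zero-padded to 8 bytes first.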
+
+def asm_line(sets, location, line):
+    global current_location, construct, nwarn_level
+
+    prev_location = current_location
+    current_location = location
+
+    try:
+        if construct != None:
+            if re_macro.match(line):
+                construct_stack.append(CONSTRUCT_MACRO)
+            elif re_if.match(line):
+                construct_stack.append(CONSTRUCT_IF)
+            elif re_rep.match(line):
+                construct_stack.append(CONSTRUCT_REP)
+            else:
+                else_m = line == '.else'
+                elif_m = re_elif.match(line)
+                if elif_m:
+                    end_construct = CONSTRUCT_IF
+                else:
+                    end_construct = {
+                        '.endm': CONSTRUCT_MACRO,
+                        '.else': CONSTRUCT_IF,
+                        '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE,
+                        '.endr': CONSTRUCT_REP}.get(line)
+                if end_construct is not None:
+                    end_construct &= construct_stack.pop()
+                    if end_construct == 0:
+                        if elif_m:
+                            asm_error('unexpected .elif')
+                        asm_error('unexpected %s' % line)
+                    if len(construct_stack) == 0:
+                        lines = construct
+                        construct = None
+                        if end_construct == CONSTRUCT_MACRO:
+                            return
+                        if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE):
+                            condition_if, condition_else = lines[0]
+                            lines = lines[1:]
+                            if condition_if:
+                                for location, line in lines:
+                                    asm_line(sets, location, line)
+                            if else_m:
+                                construct = [(condition_else, False)]
+                                construct_stack.append(CONSTRUCT_ELSE)
+                            elif elif_m:
+                                if elif_m.group('set'):
+                                    condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets))
+                                else:
+                                    condition_if = condition_else and arg_eval(elif_m.group('condition'), sets)
+                                condition_else = condition_else and (not condition_if)
+                                construct = [(condition_if, condition_else)]
+                                construct_stack.append(CONSTRUCT_IF)
+                            return
+                        if end_construct == CONSTRUCT_REP:
+                            name, count = lines[0]
+                            lines = lines[1:]
+                            for i in xrange(count):
+                                sets[name] = i
+                                for location, line in lines:
+                                    asm_line(sets, location, line)
+                            return
+                        assert 0
+                    if else_m:
+                        construct_stack.append(CONSTRUCT_ELSE)
+                    elif elif_m:
+                        construct_stack.append(CONSTRUCT_IF)
+            construct.append((current_location, line))
+            return
+
+        if line in ('.endm', '.else', '.endif', '.endr'):
+            asm_error('unexpected %s' % line)
+        if re_elif.match(line):
+            asm_error('unexpected .elif')
+
+        m = re_macro.match(line)
+        if m:
+            construct = []
+            construct_stack.append(CONSTRUCT_MACRO)
+            macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct)
+            return
+
+        m = re_if.match(line)
+        if m:
+            if m.group('set'):
+                condition = (m.group('set') == 'nset') ^ (m.group('name') in sets)
+            else:
+                # not not forces condition to a bool (this matters if condition is
+                # something mutable like a list)
+                condition = not not arg_eval(m.group('condition'), sets)
+            construct = [(condition, not condition)]
+            construct_stack.append(CONSTRUCT_IF)
+            return
+
+        m = re_rep.match(line)
+        if m:
+            count = arg_eval(m.group('count'), sets)
+            if not is_int(count):
+                asm_error('.rep count must be integer')
+            construct = [(m.group('name'), count)]
+            construct_stack.append(CONSTRUCT_REP)
+            return
+
+        m = re_include.match(line)
+        if m:
+            filename = arg_eval(m.group('filename'), sets)
+            if not isinstance(filename, str):
+                asm_error('expected string')
+            asm_file(sets, '%s: %s' % (current_location, filename), filename)
+            return
+
+        m = re_set.match(line)
+        if m:
+            sets[m.group('name')] = arg_eval(m.group('val'), sets)
+            return
+
+        m = re_unset.match(line)
+        if m:
+            name = m.group('name')
+            if name not in sets:
+                asm_error('%s not set' % name)
+            if name in arg_defs: # todo: see arg_eval
+                sets[name] = arg_defs[name]
+            else:
+                del sets[name]
+            return
+
+        m = re_eval.match(line)
+        if m:
+            arg_eval(m.group('expr'), sets)
+            return
+
+        m = re_print_info_warn_error.match(line)
+        if m:
+            def print_fn(message):
+                print message
+            def info_fn(message):
+                sys.stderr.write('%s\n' % message)
+            {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[
+                m.group('print_info_warn_error')](arg_eval(m.group('message'), sets))
+            return
+
+        m = re_assert.match(line)
+        if m:
+            if not arg_eval(m.group('condition'), sets):
+                asm_error('assertion failure: \'%s\'' % m.group('condition'))
+            return
+
+        m = re_data.match(line)
+        if m:
+            size = int(m.group('size'))
+            for datum in smart_split(m.group('data')):
+                datum = arg_eval(datum, sets)
+                if not is_int(datum):
+                    asm_error('datum must be integer')
+                prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size))
+            return
+
+        m = re_macro_inst.match(line)
+        if m:
+            name = m.group('name')
+            if name in macros:
+                params, lines = macros[name]
+                args = smart_split(m.group('args'))
+                if len(args) > len(params):
+                    asm_error('too many arguments to macro')
+                sets = sets.copy()
+                sets.update(zip(params, (arg_eval(arg, sets) for arg in args)))
+                for param in params[len(args):]:
+                    if param in sets:
+                        if param in arg_defs: # todo: see arg_eval
+                            sets[param] = arg_defs[param]
+                        else:
+                            del sets[param]
+                for location, line in lines:
+                    asm_line(sets, '%s: %s' % (current_location, location), line)
+                return
+
+        if line == '.pushnwarn':
+            nwarn_level += 1
+            return
+        if line == '.popnwarn':
+            if nwarn_level == 0:
+                asm_error('.popnwarn without .pushnwarn')
+            nwarn_level -= 1
+            return
+
+        # everything below assumes prog is up to date
+        asm_flush_prog_data()
+
+        m = re_label.match(line)
+        if m:
+            name = m.group('name')
+            if name[0].isdigit():
+                labels.setdefault(name, []).append(len(prog))
+            else:
+                if name[0] == ':':
+                    undecorated_name = name[1:]
+                else:
+                    undecorated_name = name
+                if (undecorated_name in labels) or ((':' + undecorated_name) in labels):
+                    asm_error('named label defined twice')
+                labels[name] = len(prog)
+            return
+
+        annots = line.split('@')
+        ops = [op.strip() for op in annots[0].split(';')]
+        annots = sum((get_annots(annot, sets) for annot in annots[1:]), [])
+        sig = get_sig(ops[-1])
+        if sig != SIG_NORMAL:
+            ops = ops[:-1]
+        if len(ops) > 2:
+            asm_error('too many ops')
+        elif (len(ops) == 1) and (ops[0] == ''):
+            ops = []
+        ops = (ops + ['nop', 'nop'])[:2]
+        m = re_op.match(ops[0])
+        if not m:
+            asm_error('invalid syntax')
+        aop, aargs_n = get_aop(m.group('op'))
+        if (aop == AOP_BRA) or (aop == AOP_BRR):
+            acond = get_bcond(m.group('cond'))
+        else:
+            acond = get_cond(m.group('cond'))
+        asf = get_setf(m.group('sf'))
+        aargs = smart_split(m.group('args'))
+        if len(aargs) != aargs_n:
+            asm_error('wrong operand count')
+        ard, ara, arb = (aargs + [None, None, None])[:3]
+        m = re_op.match(ops[1])
+        if not m:
+            asm_error('invalid syntax')
+        mop, margs_n = get_mop(m.group('op'))
+        mcond = get_cond(m.group('cond'))
+        msf = get_setf(m.group('sf'))
+        margs = smart_split(m.group('args'))
+        if len(margs) != margs_n:
+            asm_error('wrong operand count')
+        mrd, mra, mrb = (margs + [None, None, None])[:3]
+        # eval srcs first so allocator can retire and reuse registers for dst
+        aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets)
+        abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets)
+        maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets)
+        mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets)
+        awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets)
+        mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets)
+        if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or
+            ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))):
+            asm_error('cannot have 2 arguments with different rotations')
+        if aarmux is not None:
+            awrot = (awrot + aadrot) % 16
+            awrot_r5 = (awrot_r5 + aadrot_r5) % 16
+        if (awrot != 0) or awrot_r5:
+            asm_error('rotate not allowed on add write')
+        if marmux is not None:
+            mwrot = (mwrot + madrot) % 16
+            mwrot_r5 = (mwrot_r5 + madrot_r5) % 16
+
+        afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI)
+        afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF)
+        pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes(
+            [aarpack, abrpack, marpack, mbrpack],
+            [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL],
+            aop == AOP_FTOI,
+            [awpack, mwpack],
+            [afloatw, mop == MOP_FMUL])
+        if forcebs[0]:
+            aarmux = RMUX_B
+        if forcebs[1]:
+            abrmux = RMUX_B
+        if forcebs[2]:
+            marmux = RMUX_B
+        if forcebs[3]:
+            mbrmux = RMUX_B
+
+        # extend nops to 3 operands
+        if aop == AOP_NOP:
+            awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+        if mop == MOP_NOP:
+            mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+
+        # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand)
+        if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ):
+            if forcerafloat:
+                assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand
+                # instead of duplicating the 2nd operand, take the ra operand from
+                # the mul op thus forcing the ra value to be considered a float for
+                # the purposes of unpacking
+                if marmux == RMUX_A:
+                    abraddr, abrmux = maraddr, marmux
+                else:
+                    assert mbrmux == RMUX_A
+                    abraddr, abrmux = mbraddr, mbrmux
+            else:
+                abraddr, abrmux = aaraddr, aarmux
+        else:
+            assert not forcerafloat # can only forcerafloat if we have an unused operand
+
+        # handle write addrs
+        if (awmux == mwmux) and (awmux != WMUX_ANY):
+            asm_error('add/mul ops not allowed to write to same regfile')
+        ws = (awmux == WMUX_B) or (mwmux == WMUX_A)
+
+        # handle branch
+        if (aop == AOP_BRA) or (aop == AOP_BRR):
+            # check setf
+            if asf:
+                asm_error('setf not allowed on bra/brr')
+
+            # check pack/unpack
+            if (pack != 0) or (unpack != 0):
+                asm_error('pack/unpack not allowed with bra/brr')
+
+            # handle read address
+            if aarmux == RMUX_LABEL:
+                if (aop == AOP_BRA) and aaraddr[1]:
+                    asm_warning('bra with rel label')
+                if (aop == AOP_BRR) and (not aaraddr[1]):
+                    asm_warning('brr with abs label')
+                aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+            if aarmux == RMUX_ANY:
+                aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A
+            if (aarmux != RMUX_IMM) and (aarmux != RMUX_A):
+                asm_error('branch destination must be either label, immediate, or from regfile a')
+            if aarmux == RMUX_IMM:
+                imm = aaraddr
+                raddr = 0 # can't use RADDR_NOP
+            elif aarmux == RMUX_A:
+                if (aaraddr[1] != 0) or (aaraddr[2] != 0):
+                    asm_error('rotation of read from regfile a not allowed with branch')
+                if aop == AOP_BRR:
+                    asm_warning('brr with ra')
+                imm = 0
+                raddr = aaraddr[0]
+            else:
+                assert 0
+
+            # check mul op is nop
+            if mop != MOP_NOP:
+                asm_error('mul op not allowed with branch')
+
+            # check sig
+            if sig != SIG_NORMAL:
+                asm_error('no signal allowed with branch')
+
+            if raddr >= 32:
+                asm_error('can only branch to register locations in physical regfile')
+            if raddr & 1:
+                asm_warning('branch instruction will destroy flags (see hw-2780)')
+
+            # construct branch instruction
+            prog.append((imm,
+                (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28),
+                line, annots))
+
+            return
+
+        # use COND_NEVER when possible (might save power / allow mul setf)
+        if not dict(annots).get('preserve_cond', 0):
+            if (awaddr == WADDR_NOP) and (not asf):
+                acond = COND_NEVER
+            if (mwaddr == WADDR_NOP) and (not msf):
+                mcond = COND_NEVER
+
+        # attempt to convert movs to ldi
+        if (# no mul setf
+            (not msf) and
+            # ops must either be nop or mov of sema/label/imm/immv
+            ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+            ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+            # but we don't want 2 nops
+            ((aop != AOP_NOP) or (mop != MOP_NOP)) and
+            # if both ops are movs, srcs must be identical
+            ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and
+            # no signal
+            (sig == SIG_NORMAL)):
+            # make sure aarmux/aaraddr contains the value
+            if aop != AOP_MOV:
+                aarmux = marmux
+                aaraddr = maraddr
+
+            # convert immediate
+            if aarmux == RMUX_SEMA:
+                ldi_mode = LDI_SEMA
+            elif aarmux == RMUX_LABEL:
+                ldi_mode = LDI_32
+                aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+            elif aarmux == RMUX_IMMV:
+                signed, unsigned = True, True
+                imm = 0
+                for i, elem in enumerate(aaraddr):
+                    if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1):
+                        signed = False
+                    if elem not in (0, 1, 2, 3):
+                        unsigned = False
+                    imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i))
+                if not (signed or unsigned):
+                    asm_error('can\'t encode vector immediate')
+                if signed:
+                    ldi_mode = LDI_EL_SIGNED
+                else:
+                    ldi_mode = LDI_EL_UNSIGNED
+                aaraddr, aarmux = imm, RMUX_IMM
+            elif aarmux == RMUX_IMM:
+                ldi_mode = LDI_32
+            else:
+                assert 0
+
+            # construct ldi instruction
+            prog.append((aaraddr,
+                (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28),
+                line, annots))
+
+            return
+
+        # convert movs to alu ops
+        if aop == AOP_MOV:
+            if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0):
+                aop = AOP_XOR
+                aaraddr, aarmux = 0, RMUX_AC
+                abraddr, abrmux = 0, RMUX_AC
+            else:
+                aop = AOP_OR
+                abraddr, abrmux = aaraddr, aarmux
+        if mop == MOP_MOV:
+            if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0):
+                mop = MOP_V8SUBS
+                maraddr, marmux = 0, RMUX_AC
+                mbraddr, mbrmux = 0, RMUX_AC
+            else:
+                mop = MOP_V8MIN
+                mbraddr, mbrmux = maraddr, marmux
+
+        # normal alu instruction...
+
+        # handle setf
+        if asf and (aop == AOP_NOP):
+            asm_error('nop.setf is not allowed in add pipe')
+        if msf and (mop == MOP_NOP):
+            asm_warning('nop.setf, really?')
+        if (aop == AOP_NOP) or (acond == COND_NEVER):
+            sf = msf
+        else:
+            if msf:
+                asm_error('setf only allowed on mul op if add op is nop or add condition is never')
+            sf = asf
+
+        # handle read addrs
+        raddr_a = None
+        raddr_b = None
+        immb = False
+        arot_r5 = False
+        muxes = [0, 0, 0, 0]
+        if mwrot != 0:
+            raddr_b = 48 + mwrot
+            immb = True
+        if mwrot_r5 and have_am:
+            raddr_b = 48
+            immb = True
+        for f in (lambda rmux: rmux != RMUX_ANY), (lambda rmux: rmux == RMUX_ANY): # do RMUX_ANY last
+            for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux):
+                if f(rmux):
+                    raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux)
+        add_a, add_b, mul_a, mul_b = muxes
+        if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)):
+            # some output elements might not be as expected
+            if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)):
+                bad_elems = 0xffff
+            else:
+                bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111
+                if mwrot > 12:
+                    bad_elems ^= 0xffff
+            bad_elems &= dict(annots).get('mul_used', 0xffff)
+            if not msf:
+                if mwaddr == WADDR_NOP:
+                    # not writing anywhere and not setting flags. no elements used
+                    bad_elems = 0
+                elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or
+                    ((not ws) and (mwaddr == 37))):
+                    # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/
+                    # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags.
+                    # only use element 0
+                    bad_elems &= 0x0001
+                elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or
+                    ((not ws) and (mwaddr == 42))):
+                    # writing to r5quad/x_coord/y_coord/rev_flag and not setting
+                    # flags. only use elements 0, 4, 8, and 12
+                    bad_elems &= 0x1111
+            if bad_elems:
+                asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected')
+        if raddr_a is None:
+            raddr_a = RADDR_NOP
+        if raddr_b is None:
+            raddr_b = RADDR_NOP
+        if immb:
+            if sig != SIG_NORMAL:
+                asm_error('rotation/immediates and signal don\'t mix')
+            sig = SIG_SMALLIMMED
+        if arot_r5 or (mwrot_r5 and (not have_am)):
+            if sig != SIG_NORMAL:
+                asm_error('rotation/immediates/signal don\'t mix')
+            sig = SIG_ROTATE
+
+        # construct instruction
+        prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29),
+            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28),
+            line, annots))
+    finally:
+        current_location = prev_location
+
+def preprocess_passthrough(file):
+    line_number = 0
+    for line in file:
+        line_number += 1
+        yield line_number, line
+
+def asm_file(sets, location, filename, preprocess = None):
+    global current_dir, current_location
+
+    if filename is None:
+        location = '<stdin>'
+        file = sys.stdin
+
+        prev_dir = current_dir
+    else:
+        filename = os.path.normpath(os.path.join(current_dir, filename))
+
+        try:
+            file = open(filename)
+        except Exception, e:
+            asm_error(e)
+        except:
+            asm_error('unknown error while opening file %s' % filename)
+
+        prev_dir = current_dir
+        current_dir = os.path.dirname(filename)
+
+    prev_location = current_location
+    current_location = location
+
+    if preprocess is None:
+        preprocess = preprocess_passthrough
+
+    try:
+        for line_number, line in preprocess(file):
+            # strip off comments and whitespace
+            line = line.split('#')[0].strip()
+            if line == '':
+                continue
+
+            asm_line(sets, '%s: %d' % (current_location, line_number), line)
+    finally:
+        current_dir = prev_dir
+        current_location = prev_location
+
+def asm_end_prog():
+    # check we aren't in a multi-line construct (eg .macro or .rep)
+    if construct != None:
+        asm_error({
+            CONSTRUCT_MACRO: '.macro without .endm',
+            CONSTRUCT_IF: '.if/.elif without .endif',
+            CONSTRUCT_ELSE: '.else without .endif',
+            CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]])
+
+    # check no warnings level back to 0
+    if nwarn_level != 0:
+        asm_error('.pushnwarn without .popnwarn')
+
+    # flush queued up data
+    asm_flush_prog_data()
+
+    # fixup all the label references we can
+    for pc in xrange(len(prog)):
+        if isinstance(prog[pc][0], tuple):
+            location, label, rel, offset = prog[pc][0]
+            if label[0].isdigit():
+                label_pcs = labels.get(label[:-1], [])
+                if label[-1] == 'b':
+                    label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:]
+                else:
+                    label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1]
+                if label_pcs == []:
+                    asm_error('search for label reached begin/end of file', location = location)
+                imm = label_pcs[0]
+            elif label in labels:
+                imm = labels[label]
+            elif (':' + label) in labels:
+                imm = labels[':' + label]
+            elif external_link:
+                continue # let the external linker deal with it
+            else:
+                asm_error('undefined label', location = location)
+            imm = (imm * 8) + offset
+            if rel:
+                imm -= (pc + 4) * 8 # relative to instruction after delay slots
+                imm &= (1 << 32) - 1
+            else:
+                if not external_link:
+                    asm_error('can\'t get absolute address without using an external linker. this mode doesn\'t have an external linker', location = location)
+                imm = (location, label, rel, offset, imm)
+            prog[pc] = (imm,) + prog[pc][1:]
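+
+# Illustrative sketch, not part of the original patch: a relative reference
+# like "brr -, r:1f" resolves to a byte offset measured from the instruction
+# after the 3 branch delay slots. With the branch at pc 10 and the label at
+# pc 20, the fixup above yields (20 * 8) + 0 - ((10 + 4) * 8) == 48.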
|
|
+
|
|
+def asm_init():
|
|
+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level
|
|
+
|
|
+ current_dir = os.getcwd()
|
|
+ current_location = ''
|
|
+ prog = []
|
|
+ prog_data = []
|
|
+ macros = {
|
|
+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]),
|
|
+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])}
|
|
+ labels = {}
|
|
+ construct = None
|
|
+ construct_stack = []
|
|
+ nwarn_level = 0
|
|
+
|
|
+def asm_reset_prog():
|
|
+ global prog, labels
|
|
+
|
|
+ prog = []
|
|
+ labels = {}
|
|
+
|
|
+###############################################################################
|
|
+# dumping
|
|
+###############################################################################
|
|
+
|
|
+def print_lines(lines):
|
|
+ for line in lines:
|
|
+ print line
|
|
+
|
|
+class dumper_t:
|
|
+ def external_link(self): return False
|
|
+ def begin(self): pass
|
|
+ def label(self, pc, name): pass
|
|
+ def line(self, pc, ls, ms, line, annots, first): pass
|
|
+ def end(self): pass
|
|
+ def sets(self, sets): pass
|
|
+ def direct(self, line): pass
|
|
+
|
|
+class clif_dumper_t(dumper_t):
|
|
+ def __init__(self):
|
|
+ self.annot_mode = 0
|
|
+
|
|
+ def external_link(self):
|
|
+ return True
|
|
+
|
|
+ def parse_annot_mode(self, line):
|
|
+ l = line.split(',')
|
|
+ self.annot_mode = int(l[0])
|
|
+ if self.annot_mode not in (0, 1, 2):
|
|
+ asm_error('bad annot mode')
|
|
+ if self.annot_mode == 2:
|
|
+ if len(l) != 2:
|
|
+ asm_error('expected buffer name')
|
|
+ self.annot_name = l[1].strip()
|
|
+ self.annot_offset = 0
|
|
+ elif len(l) != 1:
|
|
+ asm_error('unexpected comma')
|
|
+
|
|
+ def label(self, pc, name):
|
|
+ if (self.annot_mode != 1) and (name[0] == ':'):
|
|
+ if self.annot_mode == 2:
|
|
+ name = name + '_annotations'
|
|
+ print '@label %s' % name[1:]
|
|
+ else:
|
|
+ print '// :%s' % name
|
|
+
|
|
+ def line(self, pc, ls, ms, line, annots, first):
|
|
+ if self.annot_mode == 0:
|
|
+ if isinstance(ls, tuple):
|
|
+ if len(ls) == 5:
|
|
+ location, label, rel, offset, offset_from_prog = ls
|
|
+ assert not rel
|
|
+ ls = '[. - %d + %d]' % (pc * 8, offset_from_prog)
|
|
+ else:
|
|
+ location, label, rel, offset = ls
|
|
+ if rel:
|
|
+ asm_error('relative external label references not allowed in this mode', location = location)
|
|
+ ls = '[%s + %d]' % (label, offset)
|
|
+ else:
|
|
+ ls = '0x%08x' % ls
|
|
+ print '%s 0x%08x // %s' % (ls, ms, line)
|
|
+ elif self.annot_mode == 1:
|
|
+ print '// %s' % line
|
|
+ for annot in annots:
|
|
+ print '0x%08x 0x%08x // %s' % ({
|
|
+ # todo: would rather not have these hard coded
|
|
+ 'mul_used': 1,
|
|
+ 'preserve_cond': 2,
|
|
+ 'geomd_open': 3,
|
|
+ 'geomd_i': 4,
|
|
+ 'geomd_tris_clear': 5,
|
|
+ 'geomd_verts': 6,
|
|
+ 'geomd_tris_add': 7,
|
|
+ 'geomd_tris_set_center': 8,
|
|
+ 'geomd_region_clear': 9,
|
|
+ 'geomd_region_set': 10,
|
|
+ 'geomd_images_clear': 11,
|
|
+ 'geomd_images_l': 12,
|
|
+ 'geomd_images_b': 13,
|
|
+ 'geomd_images_r': 14,
|
|
+ 'geomd_images_t': 15,
|
|
+ 'geomd_images_add_vpm': 16,
|
|
+ 'trace_4c': 17,
|
|
+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0])
|
|
+ if len(annots) != 0:
|
|
+ print '0x00000000 // end'
|
|
+ else:
|
|
+ assert self.annot_mode == 2
|
|
+ if len(annots) == 0:
|
|
+ print '0x00000000 // %s' % line
|
|
+ else:
|
|
+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
|
|
+ self.annot_offset += (len(annots) * 8) + 4
|
|
+
|
|
+ def direct(self, line):
|
|
+ print line
|
|
+
|
|
+class plain_dumper_t(dumper_t):
|
|
+ def line(self, pc, ls, ms, line, annots, first):
|
|
+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
|
|
+
|
|
+class c_c_dumper_t(dumper_t):
|
|
+ def __init__(self, header_name, full_header_name, array_name):
|
|
+ self.header_name = header_name
|
|
+ self.array_name = array_name
|
|
+
|
|
+ def external_link(self):
|
|
+ return True
|
|
+
|
|
+ def begin(self):
|
|
+ self.external_labels = set()
|
|
+ self.lines = []
|
|
+
|
|
+ print '#include "%s.h"' % self.header_name
|
|
+ print ''
|
|
+ print '#ifdef _MSC_VER'
|
|
+ print ' #include <stdint.h>'
|
|
+ print ' /* cast through uintptr_t to avoid warnings */'
|
|
+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
|
|
+ print '#else'
|
|
+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))'
|
|
+ print '#endif'
|
|
+ print ''
|
|
+ print '#ifdef __cplusplus'
|
|
+ print 'extern "C" { /* the types are probably wrong... */'
|
|
+ print '#endif'
|
|
+
|
|
+ def label(self, pc, name):
|
|
+ self.lines.append('// :%s' % name)
|
|
+
|
|
+ def line(self, pc, ls, ms, line, annots, first):
|
|
+ if isinstance(ls, tuple):
|
|
+ if len(ls) == 5:
|
|
+ location, label, rel, offset, offset_from_prog = ls
|
|
+ assert not rel
|
|
+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
|
|
+ else:
|
|
+ location, label, rel, offset = ls
|
|
+ if rel:
|
|
+ asm_error('relative external label references not allowed in this mode', location = location)
|
|
+ if label not in self.external_labels:
|
|
+ self.external_labels.add(label)
|
|
+ print 'extern uint8_t %s[];' % label
|
|
+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
|
|
+ else:
|
|
+ ls = '0x%08x' % ls
|
|
+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
|
|
+
|
|
+ def end(self):
|
|
+ print '#ifdef __cplusplus'
|
|
+ print '}'
|
|
+ print '#endif'
|
|
+ print ''
|
|
+ print '#ifdef _MSC_VER'
|
|
+ print '__declspec(align(8))'
|
|
+ print '#elif defined(__GNUC__)'
|
|
+ print '__attribute__((aligned(8)))'
|
|
+ print '#endif'
|
|
+ print 'unsigned int %s[] = {' % self.array_name
|
|
+ print_lines(self.lines)
|
|
+ print '};'
|
|
+ print '#ifdef __HIGHC__'
|
|
+ print '#pragma Align_to(8, %s)' % self.array_name
|
|
+ print '#endif'
|
|
+
|
|
+class c_h_dumper_t(dumper_t):
|
|
+ def __init__(self, header_name, full_header_name, array_name):
|
|
+ self.full_header_name = full_header_name
|
|
+ self.array_name = array_name
|
|
+
|
|
+ def external_link(self):
|
|
+ return True
|
|
+
|
|
+ def begin(self):
|
|
+ print '#ifndef %s_H' % self.full_header_name
|
|
+ print '#define %s_H' % self.full_header_name
|
|
+ print ''
|
|
+ print 'extern unsigned int %s[];' % self.array_name
|
|
+ print ''
|
|
+
|
|
+ def label(self, pc, name):
|
|
+ if name[0] == ':':
|
|
+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
|
|
+
|
|
+ def end(self):
|
|
+ print ''
|
|
+ print '#endif'
|
|
+
|
|
+class ml_c_dumper_t(dumper_t):
+    def __init__(self, header_name, full_header_name, name, annots):
+        self.header_name = header_name
+        self.name = name
+        self.annots = annots
+
+    def external_link(self):
+        return True
+
+    def begin(self):
+        if self.annots:
+            self.annot_lines = []
+        self.lines = []
+        self.external_labels = set()
+        self.link_lines = []
+
+        print '#include "%s.h"' % self.header_name
+        print '#include <assert.h>'
+        if self.annots:
+            print '#ifdef SIMPENROSE'
+            print '#include <stddef.h>'
+            print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
+        print ''
+
+    def label(self, pc, name):
+        self.lines.append('// :%s' % name)
+
+    def line(self, pc, ls, ms, line, annots, first):
+        if self.annots:
+            if len(annots) == 0:
+                self.annot_lines.append('NULL,')
+            else:
+                print 'static unsigned int const annotations_%d[] = {' % pc
+                for annot in annots:
+                    print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
+                print ' SIMPENROSE_SHADER_ANNOTATION_END};'
+                print ''
+                self.annot_lines.append('annotations_%d,' % pc)
+        if isinstance(ls, tuple):
+            self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2))
+            if len(ls) == 5:
+                location, label, rel, offset, offset_from_prog = ls
+                assert not rel
+                self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog))
+            else:
+                location, label, rel, offset = ls
+                self.external_labels.add(label)
+                if rel:
+                    self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
+                else:
+                    self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset))
+            ls = '0xdeadbeef'
+        else:
+            ls = '0x%08x' % ls
+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+
+    def end(self):
+        if self.annots:
+            print 'unsigned int const *const %s_annotations_array[] = {' % self.name
+            print_lines(self.annot_lines)
+            print '};'
+            print '#endif'
+            print ''
+        print 'static unsigned int const array[] = {'
+        print_lines(self.lines)
+        print '};'
+        print ''
+        print 'void %s_link(void *p_in, unsigned int base' % self.name
+        for label in sorted(self.external_labels):
+            print ' , unsigned int %s' % label
+        print ' )'
+        print '{'
+        print ' unsigned int *p = (unsigned int *)p_in;'
+        print ' unsigned int i;'
+        print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper()
+        print ' p[i] = array[i];'
+        print ' }'
+        print_lines(self.link_lines)
+        print '}'
+
+class ml_h_dumper_t(dumper_t):
+    def __init__(self, header_name, full_header_name, name, annots):
+        self.full_header_name = full_header_name
+        self.name = name
+        self.annots = annots
+
+    def external_link(self):
+        return True
+
+    def begin(self):
+        self.external_labels = set()
+        self.lines_n = 0
+
+        print '#ifndef %s_H' % self.full_header_name
+        print '#define %s_H' % self.full_header_name
+        print ''
+        if self.annots:
+            print '#ifdef SIMPENROSE'
+            print ' extern unsigned int const *const %s_annotations_array[];' % self.name
+            print '#endif'
+            print ''
+
+    def label(self, pc, name):
+        if name[0] == ':':
+            print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8)
+            if self.annots:
+                print '#ifdef SIMPENROSE'
+                print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc)
+                print '#endif'
+
+    def line(self, pc, ls, ms, line, annots, first):
+        if isinstance(ls, tuple) and (len(ls) != 5):
+            self.external_labels.add(ls[1])
+        self.lines_n += 1
+
+    def end(self):
+        print ''
+        print 'extern void %s_link(void *p, unsigned int base' % self.name
+        for label in sorted(self.external_labels):
+            print ' , unsigned int %s' % label
+        print ' );'
+        print ''
+        print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8))
+        print ''
+        print '#endif'
+
+def print_lines_lc(lines):
+    for line in lines:
+        print '%s \\' % line
+
+def print_groups_lc(groups):
+    first = True
+    for group in groups:
+        if first:
+            print '{ \\'
+        else:
+            print ', { \\'
+        print_lines_lc(group)
+        print '} \\'
+        first = False
+
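+# The inline_c dumper emits the instruction words as line-continued fragments
+# (every output line ends in a backslash) so the result can be pasted into a C
+# macro. With the inline_c preprocessing mode below, an iteration block
+# produces one brace-wrapped group of words per replacement set via
+# print_groups_lc(), preceded by the per-iteration instruction count.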
+class inline_c_dumper_t(dumper_t):
+    def __init__(self, annots):
+        self.annots = annots
+        self.iteration = False
+
+    def begin_iteration(self):
+        assert not self.iteration
+        self.iteration = True
+        self.iteration_lines = []
+        if self.annots:
+            self.iteration_annot_lines = []
+            self.annot_arrs = []
+
+    def end_iteration(self):
+        assert self.iteration
+        self.iteration = False
+        print '%d, \\' % self.iteration_n
+        if self.annots:
+            print '( \\'
+        print_groups_lc(self.iteration_lines)
+        if self.annots:
+            print '), ( \\'
+            print_groups_lc(self.iteration_annot_lines)
+            print '), ( \\'
+            for annot_arr in self.annot_arrs:
+                print_lines_lc(annot_arr)
+            print ') \\'
+
+    def begin(self):
+        self.n = 0
+        self.lines = []
+        if self.annots:
+            self.annot_lines = []
+            if not self.iteration:
+                self.annot_arrs = []
+
+    def label(self, pc, name):
+        self.lines.append('/* :%s */' % name)
+        if self.annots:
+            self.annot_lines.append('/* :%s */' % name)
+
+    def line(self, pc, ls, ms, line, annots, first):
+        self.n += 1
+        if first:
+            prefix = ''
+        else:
+            prefix = ', '
+        self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line))
+        if self.annots:
+            if len(annots) == 0:
+                a = 'NULL'
+            else:
+                a = 'annotations_%d' % len(self.annot_arrs)
+                annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)]
+                for annot in annots:
+                    annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]))
+                annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};')
+                self.annot_arrs.append(annot_arr)
+            self.annot_lines.append('%s%s /* %s */' % (prefix, a, line))
+
+    def end(self):
+        if self.iteration:
+            if len(self.iteration_lines) == 0:
+                self.iteration_n = self.n
+            elif self.iteration_n != self.n:
+                asm_error('number of instructions differs between iterations')
+            self.iteration_lines.append(self.lines)
+            if self.annots:
+                self.iteration_annot_lines.append(self.annot_lines)
+        else:
+            if self.annots:
+                print '( \\'
+            print_lines_lc(self.lines)
+            if self.annots:
+                print '), ( \\'
+                print_lines_lc(self.annot_lines)
+                print '), ( \\'
+                for annot_arr in self.annot_arrs:
+                    print_lines_lc(annot_arr)
+                print ') \\'
+
+    def direct(self, line):
+        print line
+
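+# The asvc dumper emits VideoCore assembler source: an 8-byte alignment
+# directive, labels (":name" labels are emitted as "name::", presumably to
+# export them), and one ".word ls, ms" pair per instruction.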
+class asvc_dumper_t(dumper_t):
+    def external_link(self):
+        return True
+
+    def begin(self):
+        print '.align 8'
+
+    def label(self, pc, name):
+        if name[0] == ':':
+            print '%s::' % name[1:]
+        else:
+            print '%s:' % name
+
+    def line(self, pc, ls, ms, line, annots, first):
+        if isinstance(ls, tuple):
+            location, label, rel, offset = ls[:4]
+            if rel:
+                ls = '%s + %d - (. + 32)' % (label, offset)
+            else:
+                ls = '%s + %d' % (label, offset)
+        else:
+            ls = '0x%08x' % ls
+        print '.word %s, 0x%08x ; %s' % (ls, ms, line)
+
+def is_ra_or_rb(val):
+    return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B))
+
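+# The aliases dumper emits string pairs mapping label names to "bs"/"bu"
+# branch aliases and register set names to their ra/rb locations, plus a
+# DQASM_ARGS macro of -r<reg>=<name> arguments for the dqasm debugger.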
+class aliases_dumper_t(dumper_t):
+    def external_link(self):
+        return True
+
+    def begin(self):
+        print '#ifndef JUST_DQASM_ARGS'
+
+    def label(self, pc, name):
+        if not name[0].isdigit():
+            if name[0] == ':':
+                name = name[1:]
+            print '"bs%s", "bs%x",' % (name, pc * 8)
+            print '"bu%s", "bu%x",' % (name, pc * 8)
+
+    def end(self):
+        print '#endif'
+
+    # todo: handle things other than ra and rb? dqasm only allows ra and rb atm
+    def sets(self, sets):
+        dqasm_args = []
+        print '#ifndef JUST_DQASM_ARGS'
+        for name in sets:
+            if is_ra_or_rb(sets[name]):
+                dqasm_args.append('-r%s=%s' % (sets[name], name))
+                print '"%s", "%s",' % (name, sets[name])
+            elif isinstance(sets[name], list):
+                for i, val in enumerate(sets[name]):
+                    if is_ra_or_rb(val):
+                        dqasm_args.append('-r%s=%s[%d]' % (val, name, i))
+                        print '"%s[%d]", "%s",' % (name, i, val)
+        print '#endif'
+        print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args)
+
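+# dump() walks the assembled program in pc order, emitting labels at the
+# right points: sorted_labels is sorted in reverse so the next label to emit
+# is always at the end of the list and can be popped cheaply. Any labels left
+# over must sit at the very end of the program.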
+def dump(dumper):
+    if (len(prog) != 0) or (len(labels) != 0):
+        dumper.begin()
+
+        sorted_labels = []
+        for name in labels:
+            if name[0].isdigit():
+                for pc in labels[name]:
+                    sorted_labels.append((pc, name))
+            else:
+                sorted_labels.append((labels[name], name))
+        sorted_labels.sort(reverse = True)
+
+        first = True
+        for pc in xrange(len(prog)):
+            ls, ms, line, annots = prog[pc]
+            while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc):
+                dumper.label(*sorted_labels.pop())
+            dumper.line(pc, ls, ms, line, annots, first)
+            first = False
+        for sorted_label in sorted_labels:
+            assert sorted_label[0] == len(prog)
+            dumper.label(*sorted_label)
+
+        dumper.end()
+
+###############################################################################
+# preprocessing
+###############################################################################
+
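+# preprocess_inline_c() extracts assembly between %[ and %] markers embedded
+# in a C source and replaces it with the dumped instruction words. A trailing
+# %| section lists replacement sets, separated by %,, whose %/-separated
+# fields substitute %0, %1, ... in the assembly, and the block is assembled
+# once per set. A hypothetical fragment (mnemonics invented for illustration):
+#
+#   %[ mov %0, unif %| ra0 %, rb0 %]
+#
+# would assemble the block twice, once with %0 = ra0 and once with %0 = rb0.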
+def preprocess_inline_c(dumper):
+    def preprocess(file):
+        ls = None
+        line_number = 0
+        for line in file:
+            line_number += 1
+            while True:
+                if ls is None:
+                    l = line.split('%[', 1)
+                    if len(l) == 1:
+                        dumper.direct(l[0].rstrip())
+                        break
+                    dumper.direct('%s \\' % l[0].rstrip())
+                    line = l[1]
+                    ls = []
+                else:
+                    l = line.split('%]', 1)
+                    ls.append((line_number, l[0]))
+                    if len(l) == 1:
+                        break
+                    line = l[1]
+                    l = ls[-1][1].split('%|', 1)
+                    if len(l) == 1:
+                        for l_number, l in ls:
+                            yield l_number, l
+                        asm_end_prog()
+                        dump(dumper)
+                        asm_reset_prog()
+                    else:
+                        ls[-1] = (ls[-1][0], l[0])
+                        if hasattr(dumper, 'begin_iteration'):
+                            dumper.begin_iteration()
+                        for repls in l[1].split('%,'):
+                            repls = [repl.strip() for repl in repls.split('%/')]
+                            for l_number, l in ls:
+                                for i, repl in enumerate(repls):
+                                    l = l.replace('%' + str(i), repl)
+                                yield l_number, l
+                            asm_end_prog()
+                            dump(dumper)
+                            asm_reset_prog()
+                        if hasattr(dumper, 'end_iteration'):
+                            dumper.end_iteration()
+                    ls = None
+    return preprocess
+
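+# preprocess_clif() treats lines between standalone %[ and %] markers as
+# assembly, assembles other %-prefixed lines (except %@) with the % stripped,
+# hands %@ lines to the dumper's parse_annot_mode() hook if it has one, and
+# passes everything else through via direct() after flushing any pending
+# program.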
+def preprocess_clif(dumper):
+    def preprocess(file):
+        in_asm = False
+        line_number = 0
+        for line in file:
+            line_number += 1
+            if in_asm:
+                if line.strip() == '%]':
+                    asm_end_prog()
+                    dump(dumper)
+                    asm_reset_prog()
+                    in_asm = False
+                else:
+                    yield line_number, line
+            else:
+                if line.strip() == '%[':
+                    in_asm = True
+                elif (line[:1] == '%') and (line[:2] != '%@'):
+                    yield line_number, line[1:]
+                else:
+                    asm_end_prog()
+                    dump(dumper)
+                    asm_reset_prog()
+                    if line[:2] == '%@':
+                        if hasattr(dumper, 'parse_annot_mode'):
+                            dumper.parse_annot_mode(line[2:])
+                    else:
+                        dumper.direct(line.rstrip())
+    return preprocess
+
+###############################################################################
+# main
+###############################################################################
+
+def main():
+    global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
+    global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
+
+    asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
+
+    # parse command line
+    parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
+    parser.add_option('-m', '--mode', dest = 'mode',
+        help = '<mode> should be clif, plain, ' +
+        'c_c:<header_name>,<full_header_name>,<array_name>, ' +
+        'c_h:<header_name>,<full_header_name>,<array_name>, ' +
+        'ml_c:<header_name>,<full_header_name>,<name>[,annots], ' +
+        'ml_h:<header_name>,<full_header_name>,<name>[,annots], ' +
+        'inline_c[:annots], asvc, or aliases[:<preprocess_mode>]', metavar = '<mode>')
+    parser.add_option('-t', '--target', dest = 'target',
+        help = '<target> should be a0, b0, or hera', metavar = '<target>')
+    parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
+    parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
+    parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
+    parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
+    parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
+    options, args = parser.parse_args()
+    if len(args) == 0:
+        filename = None
+    elif len(args) == 1:
+        filename = args[0]
+    else:
+        parser.print_help()
+        sys.exit(-1)
+
+    # handle mode
+    mode = options.mode or 'clif' # assume clif if no mode specified
+    if mode == 'clif':
+        dumper = clif_dumper_t()
+        preprocess = preprocess_clif(dumper)
+    elif mode == 'plain':
+        dumper = plain_dumper_t()
+        preprocess = None
+    elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
+        mode_options = mode[4:].split(',')
+        if len(mode_options) != 3:
+            asm_error('badly formatted mode on command line')
+        dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
+        preprocess = None
+    elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
+        mode_options = mode[5:].split(',')
+        if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
+            asm_error('badly formatted mode on command line')
+        dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
+            }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
+        preprocess = None
+    elif mode == 'inline_c':
+        dumper = inline_c_dumper_t(False)
+        preprocess = preprocess_inline_c(dumper)
+    elif mode == 'inline_c:annots':
+        dumper = inline_c_dumper_t(True)
+        preprocess = preprocess_inline_c(dumper)
+    elif mode == 'asvc':
+        dumper = asvc_dumper_t()
+        preprocess = None
+    elif mode == 'aliases':
+        dumper = aliases_dumper_t()
+        preprocess = None
+    elif mode == 'aliases:inline_c':
+        dumper = aliases_dumper_t()
+        preprocess = preprocess_inline_c(dumper)
+    else:
+        asm_error('invalid mode')
+    external_link = dumper.external_link()
+
+    # handle target
+    target = options.target or 'b0' # assume b0 if no target specified
+    if target == 'a0':
+        have_sema = False
+        have_am = False
+        mulw_rotate = False
+        have_lthrsw = False
+    elif target == 'b0':
+        have_sema = True
+        have_am = True
+        mulw_rotate = True
+        have_lthrsw = True
+    elif target == 'hera':
+        have_sema = True
+        have_am = False
+        mulw_rotate = True
+        have_lthrsw = True
+    else:
+        asm_error('invalid target')
+    if have_am:
+        sigs['loadam'] = SIG_LOADAM
+        arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
+    if have_lthrsw:
+        sigs['lthrsw'] = SIG_LTHRSW
+        del sigs['int']
+        arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
+
+    # handle misc options
+    allow_xor_0 = options.allow_xor_0
+    dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
+    warnings_are_errors = options.warnings_are_errors
+    disable_warnings = options.disable_warnings
+
+    # make options visible to asm
+    arg_defs['mode'] = mode
+    arg_defs['target'] = target
+
+    # arg_defs all setup at this point
+    sets = arg_defs.copy() # todo: see arg_eval
+
+    # handle command line sets
+    re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
+    for options_set in options.sets:
+        m = re_options_set.match(options_set)
+        if not m:
+            asm_error('badly formatted set on command line')
+        sets[m.group('name')] = arg_eval(m.group('val'), sets)
+
+    # assemble input file and dump
+    asm_file(sets, filename, filename, preprocess)
+    asm_end_prog()
+    dump(dumper)
+    for name in arg_defs: # todo: see arg_eval
+        del sets[name]
+    dumper.sets(sets)
+
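+# Hypothetical invocation (file names invented for illustration), producing a
+# linkable C file for the b0 target:
+#   qasm.py -m ml_c:shader,SHADER,shader,annots -t b0 shader.qasm > shader.c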
+if __name__ == '__main__':
+    main()
diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
new file mode 100755
index 0000000..6a9a33f
--- /dev/null
+++ b/pi-util/rebase_liblinks.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+import os, sys
+from stat import *
+
+def walktree(top, callback, n, prefix):
+    '''recursively descend the directory tree rooted at top,
+    calling the callback function for each symlink'''
+
+    for f in os.listdir(top):
+        pathname = os.path.join(top, f)
+        mode = os.lstat(pathname).st_mode
+        if S_ISDIR(mode):
+            # It's a directory, recurse into it
+            walktree(pathname, callback, n+1, prefix)
+        elif S_ISLNK(mode):
+            # It's a symlink, call the callback function
+            callback(pathname, os.readlink(pathname), n, prefix)
+
+def visitfile(file, linkname, n, prefix):
+    if (linkname.startswith(prefix + 'lib/')):
+        newlink = "../" * n + linkname[len(prefix):]
+        print 'relinking', file, "->", newlink
+        os.remove(file)
+        os.symlink(newlink, file)
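+
+# Example: with prefix "/" and n == 2 (the link sits two directories below the
+# local root), a link to "/lib/arm-linux-gnueabihf/libm.so.6" is rewritten as
+# the relative "../../lib/arm-linux-gnueabihf/libm.so.6".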
+
+if __name__ == '__main__':
+    argc = len(sys.argv)
+    if argc == 2:
+        walktree(sys.argv[1], visitfile, 0, "/")
+    elif argc == 3:
+        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
+    else:
+        print "rebase_liblinks.py <local root> [<old sysroot>]"
+
+
+
diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
new file mode 100755
index 0000000..d8bdd91
--- /dev/null
+++ b/pi-util/syncroot.sh
@@ -0,0 +1,43 @@
+set -e
+
+if [ "$1" == "" ]; then
+  echo Usage: $0 \<src_dir\> [\<rootname\>]
+  echo src_dir is a source for rsync so may contain a machine name.
+  echo rootname will be set to \"raspian_jessie_pi1\" if missing
+  echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1
+  exit 1
+fi
+
+SYSROOT_NAME=$2
+if [ "$SYSROOT_NAME" == "" ]; then
+  SYSROOT_NAME=raspian_jessie_pi1
+fi
+
+DST_ROOT=`pwd`
+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot
+SRC=$1
+
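+# With the default rootname the sysroot lands in
+# build/linux/raspian_jessie_pi1-sysroot under the current directory.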
+echo Sync src: $SRC
+echo Sync dest: $DST
+
+mkdir -p $DST/lib
+mkdir -p $DST/opt/vc/include
+mkdir -p $DST/usr/lib/pkgconfig
+mkdir -p $DST/usr/bin
+mkdir -p $DST/usr/share
+
+#### MUST NOT include /opt/vc/include/*GL*
+# Creates conflicts with GL includes inside Chrome
+
+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib
+rsync -rl $SRC/opt/vc/lib $DST/opt/vc
+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include
+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include
+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include
+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib
+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib
+rsync -rl $SRC/usr/include $DST/usr
+
+pi-util/rebase_liblinks.py $DST
+
+