Skip to content

Commit 4fcd2ac

Browse files
committed
Merge branch 'avx2-intra-temp2'
2 parents 0844120 + ae646ec commit 4fcd2ac

13 files changed

+12019
-2076
lines changed

src/global.h

+6
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,12 @@ typedef int32_t mv_t;
316316
#define ALIGNED(alignment) __attribute__((aligned (alignment)))
317317
#endif
318318

319+
// NO_ASAN: function attribute that turns off AddressSanitizer instrumentation
// for the annotated function. Under MSVC the macro expands to nothing.
#if defined(_MSC_VER)
#define NO_ASAN
#else
#define NO_ASAN __attribute__((no_sanitize("address")))
#endif
324+
319325
#ifdef _MSC_VER
320326
// Buggy VS2010 throws intellisense warnings if void* is not casted.
321327
#define MALLOC(type, num) (type *)malloc(sizeof(type) * (num))

src/intra.c

+2-306
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,6 @@
3636

3737
#include "image.h"
3838
#include "uvg_math.h"
39-
#include "mip_data.h"
4039
#include "rdo.h"
4140
#include "search.h"
4241
#include "search_intra.h"
@@ -86,17 +85,6 @@ static const uint8_t num_ref_pixels_left[16][16] = {
8685
{ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }
8786
};
8887

89-
90-
static void mip_predict(
91-
const encoder_state_t* const state,
92-
const uvg_intra_references* const refs,
93-
const uint16_t pred_block_width,
94-
const uint16_t pred_block_height,
95-
uvg_pixel* dst,
96-
const int mip_mode,
97-
const bool mip_transp);
98-
99-
10088
int8_t uvg_intra_get_dir_luma_predictor(
10189
const uint32_t x,
10290
const uint32_t y,
@@ -646,298 +634,6 @@ uint8_t uvg_get_mip_flag_context(
646634
}
647635

648636

649-
/**
 * \brief Reduce a reference boundary to dst_len samples.
 *
 * If dst_len is smaller than src_len, consecutive groups of
 * src_len / dst_len input samples are summed, a rounding offset is added and
 * the sum is shifted right by uvg_math_floor_log2(group size). Otherwise the
 * first dst_len input samples are copied unchanged.
 *
 * \param reduced_dst  Output buffer; dst_len values are written.
 * \param ref_src      Input boundary samples.
 * \param src_len      Length of the input boundary.
 * \param dst_len      Number of output samples to produce.
 */
void uvg_mip_boundary_downsampling_1D(int* reduced_dst, const int* const ref_src, int src_len, int dst_len)
{
  if (dst_len >= src_len) {
    // No reduction needed: plain copy of the leading samples.
    for (uint16_t pos = 0; pos < dst_len; ++pos) {
      reduced_dst[pos] = ref_src[pos];
    }
    return;
  }

  // Each output sample is built from group_size consecutive inputs.
  const uint16_t group_size = src_len / dst_len;
  const int shift = uvg_math_floor_log2(group_size);
  const int half = (1 << (shift - 1));

  const int* group = ref_src;
  for (uint16_t out_pos = 0; out_pos < dst_len; out_pos++) {
    int acc = 0;
    for (int k = 0; k < group_size; k++) {
      acc += group[k];
    }
    reduced_dst[out_pos] = (acc + half) >> shift;
    group += group_size;
  }
}
678-
679-
680-
void uvg_mip_reduced_pred(int* const output,
681-
const int* const input,
682-
const uint8_t* matrix,
683-
const bool transpose,
684-
const int red_bdry_size,
685-
const int red_pred_size,
686-
const int size_id,
687-
const int in_offset,
688-
const int in_offset_tr)
689-
{
690-
const int input_size = 2 * red_bdry_size;
691-
692-
// Use local buffer for transposed result
693-
int out_buf_transposed[LCU_WIDTH * LCU_WIDTH];
694-
int* const out_ptr = transpose ? out_buf_transposed : output;
695-
696-
int sum = 0;
697-
for (int i = 0; i < input_size; i++) {
698-
sum += input[i];
699-
}
700-
const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum;
701-
assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four");
702-
703-
const uint8_t* weight = matrix;
704-
const int input_offset = transpose ? in_offset_tr : in_offset;
705-
706-
const bool red_size = (size_id == 2);
707-
int pos_res = 0;
708-
for (int y = 0; y < red_pred_size; y++) {
709-
for (int x = 0; x < red_pred_size; x++) {
710-
if (red_size) {
711-
weight -= 1;
712-
}
713-
int tmp0 = red_size ? 0 : (input[0] * weight[0]);
714-
int tmp1 = input[1] * weight[1];
715-
int tmp2 = input[2] * weight[2];
716-
int tmp3 = input[3] * weight[3];
717-
for (int i = 4; i < input_size; i += 4) {
718-
tmp0 += input[i] * weight[i];
719-
tmp1 += input[i + 1] * weight[i + 1];
720-
tmp2 += input[i + 2] * weight[i + 2];
721-
tmp3 += input[i + 3] * weight[i + 3];
722-
}
723-
out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset);
724-
pos_res++;
725-
weight += input_size;
726-
}
727-
}
728-
729-
if (transpose) {
730-
for (int y = 0; y < red_pred_size; y++) {
731-
for (int x = 0; x < red_pred_size; x++) {
732-
output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y];
733-
}
734-
}
735-
}
736-
}
737-
738-
739-
/**
 * \brief Upsample a reduced prediction along one dimension.
 *
 * For every source sample, ups_factor output samples are written. Each one
 * blends the previous sample (the boundary value for the first sample of a
 * line) and the current source sample: the previous sample starts scaled by
 * 1 << log2(ups_factor) and loses one share per step while the current
 * sample gains one; the sum plus a rounding offset is shifted right by
 * log2(ups_factor).
 *
 * \param dst                Output samples.
 * \param src                Source (reduced) samples.
 * \param boundary           Boundary samples; the first one used is at
 *                           index boundary_step - 1.
 * \param src_size_ups_dim   Source sample count along the upsampled dimension.
 * \param src_size_orth_dim  Number of lines processed.
 * \param src_step           Source step between samples of one line.
 * \param src_stride         Source step between lines.
 * \param dst_step           Destination step between output samples.
 * \param dst_stride         Destination step between lines.
 * \param boundary_step      Boundary step between lines.
 * \param ups_factor         Upsampling factor; asserted to be at least 2.
 */
void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* const boundary,
                                const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim,
                                const uint16_t src_step, const uint16_t src_stride,
                                const uint16_t dst_step, const uint16_t dst_stride,
                                const uint16_t boundary_step,
                                const uint16_t ups_factor)
{
  const int log2_factor = uvg_math_floor_log2(ups_factor);
  assert(ups_factor >= 2 && "Upsampling factor must be at least 2.");
  const int rounding_offset = 1 << (log2_factor - 1);

  const int* src_line = src;
  int* dst_line = dst;
  const int* boundary_line = boundary + boundary_step - 1;

  for (uint16_t line = 0; line < src_size_orth_dim; line++) {
    const int* prev = boundary_line;  // sample preceding the current one
    const int* next = src_line;       // current source sample
    int* out = dst_line;

    for (uint16_t sample = 0; sample < src_size_ups_dim; sample++) {
      int weighted_prev = (*prev) << log2_factor;
      int weighted_next = 0;

      for (uint16_t step = 1; step <= ups_factor; step++) {
        weighted_prev -= *prev;
        weighted_next += *next;
        *out = (weighted_prev + weighted_next + rounding_offset) >> log2_factor;
        out += dst_step;
      }

      prev = next;
      next += src_step;
    }

    src_line += src_stride;
    dst_line += dst_stride;
    boundary_line += boundary_step;
  }
}
786-
787-
788-
789-
/** \brief Matrix weighted intra prediction.
790-
*/
791-
static void mip_predict(
792-
const encoder_state_t* const state,
793-
const uvg_intra_references* const refs,
794-
const uint16_t pred_block_width,
795-
const uint16_t pred_block_height,
796-
uvg_pixel* dst,
797-
const int mip_mode,
798-
const bool mip_transp)
799-
{
800-
// MIP prediction uses int values instead of uvg_pixel as some temp values may be negative
801-
802-
uvg_pixel* out = dst;
803-
int result[32*32] = {0};
804-
const int mode_idx = mip_mode;
805-
806-
// *** INPUT PREP ***
807-
808-
// Initialize prediction parameters START
809-
uint16_t width = pred_block_width;
810-
uint16_t height = pred_block_height;
811-
812-
int size_id; // Prediction block type
813-
if (width == 4 && height == 4) {
814-
size_id = 0;
815-
}
816-
else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
817-
size_id = 1;
818-
}
819-
else {
820-
size_id = 2;
821-
}
822-
823-
// Reduced boundary and prediction sizes
824-
int red_bdry_size = (size_id == 0) ? 2 : 4;
825-
int red_pred_size = (size_id < 2) ? 4 : 8;
826-
827-
// Upsampling factors
828-
uint16_t ups_hor_factor = width / red_pred_size;
829-
uint16_t ups_ver_factor = height / red_pred_size;
830-
831-
// Upsampling factors must be powers of two
832-
assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two.");
833-
assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two.");
834-
835-
// Initialize prediction parameters END
836-
837-
int ref_samples_top[INTRA_REF_LENGTH];
838-
int ref_samples_left[INTRA_REF_LENGTH];
839-
840-
for (int i = 1; i < INTRA_REF_LENGTH; i++) {
841-
ref_samples_top[i-1] = (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
842-
ref_samples_left[i-1] = (int)refs->ref.left[i];
843-
}
844-
845-
// Compute reduced boundary with Haar-downsampling
846-
const int input_size = 2 * red_bdry_size;
847-
848-
int red_bdry[MIP_MAX_INPUT_SIZE];
849-
int red_bdry_trans[MIP_MAX_INPUT_SIZE];
850-
851-
int* const top_reduced = &red_bdry[0];
852-
int* const left_reduced = &red_bdry[red_bdry_size];
853-
854-
uvg_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size);
855-
uvg_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size);
856-
857-
// Transposed reduced boundaries
858-
int* const left_reduced_trans = &red_bdry_trans[0];
859-
int* const top_reduced_trans = &red_bdry_trans[red_bdry_size];
860-
861-
for (int x = 0; x < red_bdry_size; x++) {
862-
top_reduced_trans[x] = top_reduced[x];
863-
}
864-
for (int y = 0; y < red_bdry_size; y++) {
865-
left_reduced_trans[y] = left_reduced[y];
866-
}
867-
868-
int input_offset = red_bdry[0];
869-
int input_offset_trans = red_bdry_trans[0];
870-
871-
const bool has_first_col = (size_id < 2);
872-
// First column of matrix not needed for large blocks
873-
red_bdry[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0;
874-
red_bdry_trans[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0;
875-
876-
for (int i = 1; i < input_size; ++i) {
877-
red_bdry[i] -= input_offset;
878-
red_bdry_trans[i] -= input_offset_trans;
879-
}
880-
881-
// *** INPUT PREP *** END
882-
883-
// *** BLOCK PREDICT ***
884-
885-
const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
886-
const bool transpose = mip_transp;
887-
888-
const uint8_t* matrix;
889-
switch (size_id) {
890-
case 0:
891-
matrix = &uvg_mip_matrix_4x4[mode_idx][0][0];
892-
break;
893-
case 1:
894-
matrix = &uvg_mip_matrix_8x8[mode_idx][0][0];
895-
break;
896-
case 2:
897-
matrix = &uvg_mip_matrix_16x16[mode_idx][0][0];
898-
break;
899-
default:
900-
assert(false && "Invalid MIP size id.");
901-
}
902-
903-
// Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8
904-
int red_pred_buffer[8*8];
905-
int* const reduced_pred = need_upsampling ? red_pred_buffer : result;
906-
907-
const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
908-
909-
uvg_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans);
910-
if (need_upsampling) {
911-
const int* ver_src = reduced_pred;
912-
uint16_t ver_src_step = width;
913-
914-
if (ups_hor_factor > 1) {
915-
int* const hor_dst = result + (ups_ver_factor - 1) * width;
916-
ver_src = hor_dst;
917-
ver_src_step *= ups_ver_factor;
918-
919-
uvg_mip_pred_upsampling_1D(hor_dst, reduced_pred, ref_samples_left,
920-
red_pred_size, red_pred_size,
921-
1, red_pred_size, 1, ver_src_step,
922-
ups_ver_factor, ups_hor_factor);
923-
}
924-
925-
if (ups_ver_factor > 1) {
926-
uvg_mip_pred_upsampling_1D(result, ver_src, ref_samples_top,
927-
red_pred_size, width,
928-
ver_src_step, 1, width, 1,
929-
1, ups_ver_factor);
930-
}
931-
}
932-
933-
// Assign and cast values from temp array to output
934-
for (int i = 0; i < 32 * 32; i++) {
935-
out[i] = (uvg_pixel)result[i];
936-
}
937-
// *** BLOCK PREDICT *** END
938-
}
939-
940-
941637
int8_t uvg_wide_angle_correction(
942638
int_fast8_t mode,
943639
const int log2_width,
@@ -1618,7 +1314,7 @@ void uvg_intra_predict(
16181314
if (intra_mode < 68) {
16191315
if (use_mip) {
16201316
assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
1621-
mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
1317+
uvg_mip_predict(refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
16221318
}
16231319
else {
16241320
intra_predict_regular(state, refs, &data->pred_cu, cu_loc, pu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode);
@@ -1804,7 +1500,7 @@ static void intra_recon_tb_leaf(
18041500

18051501
uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode);
18061502

1807-
uvg_pixel pred[32 * 32];
1503+
ALIGNED(32) uvg_pixel pred[32 * 32];
18081504
uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu);
18091505

18101506
const int index = lcu_px.x + lcu_px.y * lcu_width;

0 commit comments

Comments
 (0)