ultravideo
diff --git a/‎src/global.h
+6 b/‎src/global.h
+6
diff --git a/‎src/intra.c
+2-306 b/‎src/intra.c
+2-306
@@ -316,6 +316,12 @@ typedef int32_t mv_t;
   #define ALIGNED(alignment) __attribute__((aligned (alignment)))
 #endif
 
+#ifdef _MSC_VER /* NO_ASAN: opt a function out of AddressSanitizer; expands to nothing on MSVC */
+#define NO_ASAN
+#else /* GCC/Clang: this spelling works from GCC 4.8 on; no_sanitize("address") needs GCC >= 8 */
+#define NO_ASAN __attribute__((no_sanitize_address))
+#endif
+
 #ifdef _MSC_VER
 // Buggy VS2010 throws intellisense warnings if void* is not casted.
   #define MALLOC(type, num) (type *)malloc(sizeof(type) * (num))
 
@@ -36,7 +36,6 @@
 
 #include "image.h"
 #include "uvg_math.h"
-#include "mip_data.h"
 #include "rdo.h"
 #include "search.h"
 #include "search_intra.h"
@@ -86,17 +85,6 @@ static const uint8_t num_ref_pixels_left[16][16] = {
   { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
 };
 
-
-static void mip_predict(
-  const encoder_state_t* const state,
-  const uvg_intra_references* const refs,
-  const uint16_t pred_block_width,
-  const uint16_t pred_block_height,
-  uvg_pixel* dst,
-  const int mip_mode,
-  const bool mip_transp);
-
-
 int8_t uvg_intra_get_dir_luma_predictor(
   const uint32_t x,
   const uint32_t y,
@@ -646,298 +634,6 @@ uint8_t uvg_get_mip_flag_context(
 }
 
 
-void uvg_mip_boundary_downsampling_1D(int* reduced_dst, const int* const ref_src, int src_len, int dst_len)
-{
-  if (dst_len < src_len)
-  {
-    // Create reduced boundary by downsampling
-    uint16_t down_smp_factor = src_len / dst_len;
-    const int log2_factor = uvg_math_floor_log2(down_smp_factor);
-    const int rounding_offset = (1 << (log2_factor - 1));
-
-    uint16_t src_idx = 0;
-    for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++)
-    {
-      int sum = 0;
-      for (int k = 0; k < down_smp_factor; k++)
-      {
-        sum += ref_src[src_idx++];
-      }
-      reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor;
-    }
-  }
-  else
-  {
-    // Copy boundary if no downsampling is needed
-    for (uint16_t i = 0; i < dst_len; ++i)
-    {
-      reduced_dst[i] = ref_src[i];
-    }
-  }
-}
-
-
-void uvg_mip_reduced_pred(int* const output,
-                          const int* const input,
-                          const uint8_t* matrix,
-                          const bool transpose,
-                          const int red_bdry_size,
-                          const int red_pred_size,
-                          const int size_id,
-                          const int in_offset,
-                          const int in_offset_tr)
-{
-  const int input_size = 2 * red_bdry_size;
-
-  // Use local buffer for transposed result
-  int out_buf_transposed[LCU_WIDTH * LCU_WIDTH];
-  int* const out_ptr = transpose ? out_buf_transposed : output;
-
-  int sum = 0;
-  for (int i = 0; i < input_size; i++) { 
-    sum += input[i];
-  }
-  const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum;
-  assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four");
-
-  const uint8_t* weight = matrix;
-  const int input_offset = transpose ? in_offset_tr : in_offset;
-
-  const bool red_size = (size_id == 2);
-  int pos_res = 0;
-  for (int y = 0; y < red_pred_size; y++) {
-    for (int x = 0; x < red_pred_size; x++) {
-      if (red_size) {
-        weight -= 1;
-      }
-      int tmp0 = red_size ? 0 : (input[0] * weight[0]);
-      int tmp1 = input[1] * weight[1];
-      int tmp2 = input[2] * weight[2];
-      int tmp3 = input[3] * weight[3];
-      for (int i = 4; i < input_size; i += 4) {
-        tmp0 += input[i] * weight[i];
-        tmp1 += input[i + 1] * weight[i + 1];
-        tmp2 += input[i + 2] * weight[i + 2];
-        tmp3 += input[i + 3] * weight[i + 3];
-      }
-      out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset);
-      pos_res++;
-      weight += input_size;
-    }
-  }
-
-  if (transpose) {
-    for (int y = 0; y < red_pred_size; y++) {
-      for (int x = 0; x < red_pred_size; x++) {
-        output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y];
-      }
-    }
-  }
-}
-
-
-void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* const boundary,
-                                const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim,
-                                const uint16_t src_step, const uint16_t src_stride,
-                                const uint16_t dst_step, const uint16_t dst_stride,
-                                const uint16_t boundary_step,
-                                const uint16_t ups_factor)
-{
-  const int log2_factor = uvg_math_floor_log2(ups_factor);
-  assert(ups_factor >= 2 && "Upsampling factor must be at least 2.");
-  const int rounding_offset = 1 << (log2_factor - 1);
-
-  uint16_t idx_orth_dim = 0;
-  const int* src_line = src;
-  int* dst_line = dst;
-  const int* boundary_line = boundary + boundary_step - 1;
-  while (idx_orth_dim < src_size_orth_dim)
-  {
-    uint16_t idx_upsample_dim = 0;
-    const int* before = boundary_line;
-    const int* behind = src_line;
-    int* cur_dst = dst_line;
-    while (idx_upsample_dim < src_size_ups_dim)
-    {
-      uint16_t pos = 1;
-      int scaled_before = (*before) << log2_factor;
-      int scaled_behind = 0;
-      while (pos <= ups_factor)
-      {
-        scaled_before -= *before;
-        scaled_behind += *behind;
-        *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor;
-
-        pos++;
-        cur_dst += dst_step;
-      }
-
-      idx_upsample_dim++;
-      before = behind;
-      behind += src_step;
-    }
-
-    idx_orth_dim++;
-    src_line += src_stride;
-    dst_line += dst_stride;
-    boundary_line += boundary_step;
-  }
-}
-
-
-
-/** \brief Matrix weighted intra prediction.
-*/
-static void mip_predict(
-  const encoder_state_t* const state,
-  const uvg_intra_references* const refs,
-  const uint16_t pred_block_width,
-  const uint16_t pred_block_height,
-  uvg_pixel* dst,
-  const int mip_mode,
-  const bool mip_transp)
-{
-  // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative
-  
-  uvg_pixel* out = dst;
-  int result[32*32] = {0};
-  const int mode_idx = mip_mode;
-
-  // *** INPUT PREP ***
-
-  // Initialize prediction parameters START
-  uint16_t width = pred_block_width;
-  uint16_t height = pred_block_height;
-
-  int size_id; // Prediction block type
-  if (width == 4 && height == 4) {
-    size_id = 0;
-  }
-  else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
-    size_id = 1;
-  }
-  else {
-    size_id = 2;
-  }
-
-  // Reduced boundary and prediction sizes
-  int red_bdry_size = (size_id == 0) ? 2 : 4;
-  int red_pred_size = (size_id < 2) ? 4 : 8;
-
-  // Upsampling factors
-  uint16_t ups_hor_factor = width / red_pred_size;
-  uint16_t ups_ver_factor = height / red_pred_size;
-
-  // Upsampling factors must be powers of two
-  assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two.");
-  assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two.");
-
-  // Initialize prediction parameters END
-
-  int ref_samples_top[INTRA_REF_LENGTH]; 
-  int ref_samples_left[INTRA_REF_LENGTH];
-
-  for (int i = 1; i < INTRA_REF_LENGTH; i++) {
-    ref_samples_top[i-1] =  (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
-    ref_samples_left[i-1] = (int)refs->ref.left[i];
-  }
-
-  // Compute reduced boundary with Haar-downsampling
-  const int input_size = 2 * red_bdry_size;
-
-  int red_bdry[MIP_MAX_INPUT_SIZE];
-  int red_bdry_trans[MIP_MAX_INPUT_SIZE];
-
-  int* const top_reduced = &red_bdry[0];
-  int* const left_reduced = &red_bdry[red_bdry_size];
-
-  uvg_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size);
-  uvg_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size);
-
-  // Transposed reduced boundaries
-  int* const left_reduced_trans = &red_bdry_trans[0];
-  int* const top_reduced_trans = &red_bdry_trans[red_bdry_size];
-
-  for (int x = 0; x < red_bdry_size; x++) {
-    top_reduced_trans[x] = top_reduced[x];
-  }
-  for (int y = 0; y < red_bdry_size; y++) {
-    left_reduced_trans[y] = left_reduced[y];
-  }
-
-  int input_offset = red_bdry[0];
-  int input_offset_trans = red_bdry_trans[0];
-
-  const bool has_first_col = (size_id < 2);
-  // First column of matrix not needed for large blocks
-  red_bdry[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0;
-  red_bdry_trans[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0;
-
-  for (int i = 1; i < input_size; ++i) {
-    red_bdry[i] -= input_offset;
-    red_bdry_trans[i] -= input_offset_trans;
-  }
-
-  // *** INPUT PREP *** END
-
-  // *** BLOCK PREDICT ***
-
-  const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
-  const bool transpose = mip_transp;
-
-  const uint8_t* matrix;
-  switch (size_id) {
-    case 0: 
-      matrix = &uvg_mip_matrix_4x4[mode_idx][0][0];
-      break;
-    case 1: 
-      matrix = &uvg_mip_matrix_8x8[mode_idx][0][0];
-      break;
-    case 2: 
-      matrix = &uvg_mip_matrix_16x16[mode_idx][0][0];
-      break;
-    default:
-      assert(false && "Invalid MIP size id.");
-  }
-
-  // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8
-  int red_pred_buffer[8*8];
-  int* const reduced_pred = need_upsampling ? red_pred_buffer : result;
-
-  const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
-
-  uvg_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans);
-  if (need_upsampling) {
-    const int* ver_src = reduced_pred;
-    uint16_t ver_src_step = width;
-    
-    if (ups_hor_factor > 1) {
-      int* const hor_dst = result + (ups_ver_factor - 1) * width;
-      ver_src = hor_dst;
-      ver_src_step *= ups_ver_factor;
-
-      uvg_mip_pred_upsampling_1D(hor_dst, reduced_pred, ref_samples_left,
-        red_pred_size, red_pred_size,
-        1, red_pred_size, 1, ver_src_step,
-        ups_ver_factor, ups_hor_factor);
-    }
-
-    if (ups_ver_factor > 1) {
-      uvg_mip_pred_upsampling_1D(result, ver_src, ref_samples_top,
-        red_pred_size, width,
-        ver_src_step, 1, width, 1,
-        1, ups_ver_factor);
-    }
-  }
-
-  // Assign and cast values from temp array to output
-  for (int i = 0; i < 32 * 32; i++) {
-    out[i] = (uvg_pixel)result[i];
-  }
-  // *** BLOCK PREDICT *** END
-}
-
-
 int8_t uvg_wide_angle_correction(
   int_fast8_t mode,
   const int log2_width,
@@ -1618,7 +1314,7 @@ void uvg_intra_predict(
   if (intra_mode < 68) {
     if (use_mip) {
       assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
-      mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
+      uvg_mip_predict(refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
     }
     else {
       intra_predict_regular(state, refs, &data->pred_cu, cu_loc, pu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode);
@@ -1804,7 +1500,7 @@ static void intra_recon_tb_leaf(
 
   uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode);
 
-  uvg_pixel pred[32 * 32];
+  ALIGNED(32) uvg_pixel pred[32 * 32];
   uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu);
 
   const int index = lcu_px.x + lcu_px.y * lcu_width;