|
| 1 | +/////////////////////////////////////////////////////////////////////// |
| 2 | +// File: intsimdmatrix.h |
| 3 | +// Description: Base class for 8-bit int SIMD matrix multipliers. |
| 4 | +// Author: Ray Smith |
| 5 | +// Created: Tue Aug 15 07:37:20 PST 2017 |
| 6 | +// |
| 7 | +// (C) Copyright 2017, Google Inc. |
| 8 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 9 | +// you may not use this file except in compliance with the License. |
| 10 | +// You may obtain a copy of the License at |
| 11 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | +// Unless required by applicable law or agreed to in writing, software |
| 13 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +// See the License for the specific language governing permissions and |
| 16 | +// limitations under the License. |
| 17 | +/////////////////////////////////////////////////////////////////////// |
| 18 | + |
| 19 | +#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_ |
| 20 | +#define TESSERACT_ARCH_INTSIMDMATRIX_H_ |
| 21 | + |
| 22 | +#include <stdint.h> |
| 23 | +#include <vector> |
| 24 | +#include "genericvector.h" |
| 25 | +#include "matrix.h" |
| 26 | + |
| 27 | +namespace tesseract { |
| 28 | + |
| 29 | +// Base class for a SIMD function to multiply a matrix by a vector, with sources |
| 30 | +// of 8-bit signed integer, and result in a double, after appropriate scaling. |
| 31 | +// Assumes a specific method of multiplication that can be applied to any size |
| 32 | +// and number of SIMD registers as follows: |
| 33 | +// int32_t results are computed with num_outputs_per_register_ in each of |
| 34 | +// max_output_registers_ result registers, repeatedly until it would make too |
| 35 | +// many results, then the number of registers is halved, and so-on down to a |
| 36 | +// single result register. The last calculation only outputs the required number |
| 37 | +// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs, |
| 38 | +// num_outputs_per_register_ = 4, and max_output_registers_ = 8, |
| 39 | +// Step 1: 8x4=32 results are computed, |
| 40 | +// Step 2: 8x4=32 again, total 64, |
| 41 | +// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72, |
| 42 | +// Step 4: 1x3, total 75. |
| 43 | +// Each step above is computed using a PartialFunc, which runs over the input |
| 44 | +// vector once. The input is read one registerful of num_inputs_per_register_ |
| 45 | +// at a time (presumably 4x num_outputs_per_register_ since they are int8_t) |
| 46 | +// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_. |
| 47 | +// Since it is slow (on Intel at least) to horizontally add in a register, |
| 48 | +// provision is made to process num_inputs_per_group_ inputs at a time, with |
| 49 | +// the group being replicated num_input_groups_ times and multiplied by a |
| 50 | +// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix. |
| 51 | +// This is most convenient if num_inputs_per_group_ is 4, and the product |
| 52 | +// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent |
| 53 | +// results in the process, but it doesn't have to be implemented that way. |
| 54 | +// The weights are re-ordered by Init() to be used sequentially by the above |
| 55 | +// algorithm, followed by the biases, so they can be added at the end. |
| 56 | +// The base class computes the base C++ implementation. |
| 57 | +// NOTE that, although the subclasses execute on different SIMD hardware, no |
| 58 | +// virtual methods are needed, as the constructor sets up everything that |
| 59 | +// is required to allow the base class implementation to do all the work. |
| 60 | +class IntSimdMatrix { |
| 61 | + public: |
| 62 | + // Constructor should set the data members to indicate the sizes. |
| 63 | + // NOTE: Base constructor public only for test purposes. |
| 64 | + IntSimdMatrix() |
| 65 | + : num_outputs_per_register_(1), |
| 66 | + max_output_registers_(1), |
| 67 | + num_inputs_per_register_(1), |
| 68 | + num_inputs_per_group_(1), |
| 69 | + num_input_groups_(1) {} |
| 70 | + |
| 71 | + // Factory makes and returns an IntSimdMatrix (sub)class of the best |
| 72 | + // available type for the current architecture. |
| 73 | + static IntSimdMatrix* GetFastestMultiplier(); |
| 74 | + |
| 75 | + // Computes a reshaped copy of the weight matrix w. If there are no |
| 76 | + // partial_funcs_, it does nothing. |
| 77 | + void Init(const GENERIC_2D_ARRAY<int8_t>& w); |
| 78 | + |
| 79 | + // Rounds the size up to a multiple of the input register size (in int8_t). |
| 80 | + int RoundInputs(int size) const { |
| 81 | + return Roundup(size, num_inputs_per_register_); |
| 82 | + } |
| 83 | + // Rounds the size up to a multiple of the output register size (in int32_t). |
| 84 | + int RoundOutputs(int size) const { |
| 85 | + return Roundup(size, num_outputs_per_register_); |
| 86 | + } |
| 87 | + |
| 88 | + // Computes matrix.vector v = Wu. |
| 89 | + // u is of size W.dim2() - 1 and the output v is of size W.dim1(). |
| 90 | + // u is imagined to have an extra element at the end with value 1, to |
| 91 | + // implement the bias, but it doesn't actually have it. |
| 92 | + // Computes the base C++ implementation, if there are no partial_funcs_. |
| 93 | + // NOTE: The size of the input vector (u) must be padded using |
| 94 | + // RoundInputs above. |
| 95 | + // The input will be over-read to the extent of the padding. There are no |
| 96 | + // alignment requirements. |
| 97 | + void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w, |
| 98 | + const GenericVector<double>& scales, const int8_t* u, |
| 99 | + double* v) const; |
| 100 | + |
| 101 | + protected: |
| 102 | + // Function to compute part of a matrix.vector multiplication. The weights |
| 103 | + // are in a very specific order (see above) in w, which is multiplied by |
| 104 | + // u of length num_in, to produce output v after scaling the integer results |
| 105 | + // by the corresponding member of scales. |
| 106 | + // The amount of w and scales consumed is fixed and not available to the |
| 107 | + // caller. The number of outputs written to v will be at most num_out. |
| 108 | + typedef void (*PartialFunc)(const int8_t* w, const double* scales, |
| 109 | + const int8_t* u, int num_in, int num_out, |
| 110 | + double* v); |
| 111 | + |
| 112 | + // Rounds the input up to a multiple of the given factor. |
| 113 | + static int Roundup(int input, int factor) { |
| 114 | + return (input + factor - 1) / factor * factor; |
| 115 | + } |
| 116 | + |
| 117 | + // Number of 32 bit outputs held in each register. |
| 118 | + int num_outputs_per_register_; |
| 119 | + // Maximum number of registers that we will use to hold outputs. |
| 120 | + int max_output_registers_; |
| 121 | + // Number of 8 bit inputs in the inputs register. |
| 122 | + int num_inputs_per_register_; |
| 123 | + // Number of inputs in each weight group. |
| 124 | + int num_inputs_per_group_; |
| 125 | + // Number of groups of inputs to be broadcast. |
| 126 | + int num_input_groups_; |
| 127 | + // The weights matrix reorganized in whatever way suits this instance. |
| 128 | + std::vector<int8_t> shaped_w_; |
| 129 | + // A series of functions to compute a partial result. |
| 130 | + std::vector<PartialFunc> partial_funcs_; |
| 131 | +}; |
| 132 | + |
| 133 | +} // namespace tesseract |
| 134 | + |
| 135 | +#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_ |
0 commit comments