
Commit fc6a390

Author: Ray Smith (committed)
Added intsimdmatrix as a generic integer matrixdotvector function with AVX2 and SSE specializations
Parent: ad74e8a

21 files changed: +1549 −41 lines

api/Makefile.am (+2)

@@ -30,6 +30,7 @@ libtesseract_api_la_LIBADD = \
 	../dict/libtesseract_dict.la \
 	../arch/libtesseract_arch.la \
 	../arch/libtesseract_avx.la \
+	../arch/libtesseract_avx2.la \
 	../arch/libtesseract_sse.la \
 	../lstm/libtesseract_lstm.la \
 	../ccstruct/libtesseract_ccstruct.la \
@@ -60,6 +61,7 @@ libtesseract_la_LIBADD = \
 	../dict/libtesseract_dict.la \
 	../arch/libtesseract_arch.la \
 	../arch/libtesseract_avx.la \
+	../arch/libtesseract_avx2.la \
 	../arch/libtesseract_sse.la \
 	../lstm/libtesseract_lstm.la \
 	../ccstruct/libtesseract_ccstruct.la \

arch/Makefile.am (+12 −6)

@@ -1,4 +1,4 @@
-AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
+AM_CPPFLAGS += -I$(top_srcdir)/ccstruct -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
 AUTOMAKE_OPTIONS = subdir-objects
 SUBDIRS =
 AM_CXXFLAGS =
@@ -8,31 +8,37 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
 AM_CPPFLAGS += -DTESS_EXPORTS
 endif
 
-include_HEADERS = dotproductavx.h dotproductsse.h simddetect.h
+include_HEADERS = dotproductavx.h dotproductsse.h intsimdmatrix.h intsimdmatrixavx2.h intsimdmatrixsse.h simddetect.h
 
 noinst_HEADERS =
 
 if !USING_MULTIPLELIBS
-noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
+noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_avx2.la libtesseract_sse.la
 noinst_LTLIBRARIES += libtesseract_arch.la
 else
-lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
+lib_LTLIBRARIES = libtesseract_avx.la libtesseract_avx2.la libtesseract_sse.la
 lib_LTLIBRARIES += libtesseract_arch.la
 libtesseract_arch_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
+libtesseract_avx2_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 libtesseract_sse_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 endif
 
 if AVX_OPT
 libtesseract_avx_la_CXXFLAGS = -mavx
 endif
+if AVX2_OPT
+libtesseract_avx2_la_CXXFLAGS = -mavx2
+endif
 if SSE41_OPT
 libtesseract_sse_la_CXXFLAGS = -msse4.1
 endif
 
-libtesseract_arch_la_SOURCES = simddetect.cpp
+libtesseract_arch_la_SOURCES = intsimdmatrix.cpp simddetect.cpp
 
 libtesseract_avx_la_SOURCES = dotproductavx.cpp
 
-libtesseract_sse_la_SOURCES = dotproductsse.cpp
+libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp
+
+libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp
arch/intsimdmatrix.cpp (new file, +133)

///////////////////////////////////////////////////////////////////////
// File:        intsimdmatrix.cpp
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author:      Ray Smith
// Created:     Tue Aug 15 08:01:32 PST 2017
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#include "intsimdmatrix.h"
#include "intsimdmatrixavx2.h"
#include "intsimdmatrixsse.h"
#include "simddetect.h"

namespace tesseract {

// Factory makes and returns an IntSimdMatrix (sub)class of the best
// available type for the current architecture.
/* static */
IntSimdMatrix* IntSimdMatrix::GetFastestMultiplier() {
  IntSimdMatrix* multiplier = nullptr;
  if (SIMDDetect::IsAVX2Available()) {
    multiplier = new IntSimdMatrixAVX2();
  } else if (SIMDDetect::IsSSEAvailable()) {
    multiplier = new IntSimdMatrixSSE();
  } else {
    // Default c++ implementation.
    multiplier = new IntSimdMatrix();
  }
  return multiplier;
}

// Computes a reshaped copy of the weight matrix w. If there are no
// partial_funcs_, it does nothing.
void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w) {
  if (partial_funcs_.empty()) return;
  int num_out = w.dim1();
  int num_in = w.dim2() - 1;
  // The rounded-up sizes of the reshaped weight matrix, excluding biases.
  int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
  int rounded_num_out = RoundOutputs(num_out);
  // Add the bias and compute the required size.
  shaped_w_.resize((rounded_num_in + 1) * rounded_num_out, 0);
  int shaped_index = 0;
  int output = 0;
  // Each number of registers needs a different format! Iterates over the
  // different numbers of registers (each a power of 2).
  for (int num_registers = max_output_registers_; num_registers >= 1;
       num_registers /= 2) {
    // The number of outputs that we will generate with this many registers.
    int num_outputs_per_register_set =
        num_registers * num_outputs_per_register_;
    // Use the max number of registers until we have to go fewer.
    while (output + num_outputs_per_register_set <= rounded_num_out) {
      // Accumulating outputs in registers saves iterating over the inputs, so
      // we only have to do it once per output register set.
      for (int input = 0; input < num_in; input += num_inputs_per_group_) {
        // Iterate over the number of outputs in a register set.
        for (int j = 0; j < num_outputs_per_register_set; ++j) {
          // Inner-most loop corresponds to the number of inputs in an input
          // group.
          for (int i = 0; i < num_inputs_per_group_; ++i) {
            int8_t weight = 0;
            if (output + j < num_out && input + i < num_in)
              weight = w(output + j, input + i);
            shaped_w_[shaped_index++] = weight;
          }
        }
      }
      // Append the bias weights for the register set.
      for (int j = 0; j < num_outputs_per_register_set; ++j) {
        int8_t weight = 0;
        if (output + j < num_out) weight = w(output + j, num_in);
        shaped_w_[shaped_index++] = weight;
      }
      output += num_outputs_per_register_set;
    }
  }
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
                                    const GenericVector<double>& scales,
                                    const int8_t* u, double* v) const {
  int num_out = w.dim1();
  int num_in = w.dim2() - 1;
  if (partial_funcs_.empty()) {
    // Base implementation.
    for (int i = 0; i < num_out; ++i) {
      const int8_t* wi = w[i];
      int total = 0;
      for (int j = 0; j < num_in; ++j) total += wi[j] * u[j];
      // Add in the bias and correct for integer values.
      v[i] = (static_cast<double>(total) / MAX_INT8 + wi[num_in]) * scales[i];
    }
  } else {
    const int8_t* w_data = shaped_w_.data();
    const double* scales_data = &scales[0];
    // Each call to a partial_func_ produces group_size outputs, except the
    // last one, which can produce less.
    int group_size = num_outputs_per_register_ * max_output_registers_;
    int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
    int rounded_num_out = RoundOutputs(num_out);
    int output = 0;
    for (auto fn : partial_funcs_) {
      // The amount of w_data consumed by each call to fn.
      int w_step = (rounded_num_in + 1) * group_size;
      // Run with this group size, until it would produce too much output, then
      // switch to a smaller size.
      for (; output + group_size <= rounded_num_out; output += group_size) {
        (*fn)(w_data, scales_data, u, rounded_num_in, num_out - output, v);
        w_data += w_step;
        scales_data += group_size;
        v += group_size;
      }
      group_size /= 2;
    }
  }
}

}  // namespace tesseract
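
For orientation, here is a minimal caller sketch for the new API. It is not part of this commit: only GetFastestMultiplier(), Init(), RoundInputs() and MatrixDotVector() come from the code above, while the dimensions, fill values and the GENERIC_2D_ARRAY/GenericVector construction helpers used here are illustrative assumptions.

// Hypothetical caller sketch (not from this commit).
#include <memory>
#include <vector>

#include "intsimdmatrix.h"

void ExampleMatrixDotVector() {
  const int num_out = 75;  // outputs = rows of W
  const int num_in = 96;   // real inputs; W carries one extra bias column
  // Weight matrix with the bias stored in the last column (all zeros here).
  GENERIC_2D_ARRAY<int8_t> w(num_out, num_in + 1, 0);
  // One scale factor per output row (value chosen arbitrarily).
  GenericVector<double> scales;
  scales.init_to_size(num_out, 1.0 / 64);

  // Pick the best multiplier for this CPU and reshape the weights for it.
  std::unique_ptr<tesseract::IntSimdMatrix> matrix(
      tesseract::IntSimdMatrix::GetFastestMultiplier());
  matrix->Init(w);

  // Inputs must be padded with RoundInputs(); the padding may be over-read.
  std::vector<int8_t> u(matrix->RoundInputs(num_in), 0);
  std::vector<double> v(num_out);
  matrix->MatrixDotVector(w, scales, u.data(), v.data());
}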

arch/intsimdmatrix.h (new file, +135)

///////////////////////////////////////////////////////////////////////
// File:        intsimdmatrix.h
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author:      Ray Smith
// Created:     Tue Aug 15 07:37:20 PST 2017
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
#define TESSERACT_ARCH_INTSIMDMATRIX_H_

#include <stdint.h>
#include <vector>
#include "genericvector.h"
#include "matrix.h"

namespace tesseract {

// Base class for a SIMD function to multiply a matrix by a vector, with sources
// of 8-bit signed integer, and result in a double, after appropriate scaling.
// Assumes a specific method of multiplication that can be applied to any size
// and number of SIMD registers as follows:
// int32_t results are computed with num_outputs_per_register_ in each of
// max_output_registers_ result registers, repeatedly until it would make too
// many results, then the number of registers is halved, and so-on down to a
// single result register. The last calculation only outputs the required number
// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
// num_outputs_per_register_ = 4, and max_output_registers_ = 8,
// Step 1: 8x4=32 results are computed,
// Step 2: 8x4=32 again, total 64,
// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
// Step 4: 1x3, total 75.
// Each step above is computed using a PartialFunc, which runs over the input
// vector once. The input is read one registerful of num_inputs_per_register_
// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
// Since it is slow (on Intel at least) to horizontally add in a register,
// provision is made to process num_inputs_per_group_ inputs at a time, with
// the group being replicated num_input_groups_ times and multiplied by a
// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
// This is most convenient if num_inputs_per_group_ is 4, and the product
// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
// results in the process, but it doesn't have to be implemented that way.
// The weights are re-ordered by Init() to be used sequentially by the above
// algorithm, followed by the biases, so they can be added at the end.
// The base class computes the base C++ implementation.
// NOTE that, although the subclasses execute on different SIMD hardware, no
// virtual methods are needed, as the constructor sets up everything that
// is required to allow the base class implementation to do all the work.
class IntSimdMatrix {
 public:
  // Constructor should set the data members to indicate the sizes.
  // NOTE: Base constructor public only for test purposes.
  IntSimdMatrix()
      : num_outputs_per_register_(1),
        max_output_registers_(1),
        num_inputs_per_register_(1),
        num_inputs_per_group_(1),
        num_input_groups_(1) {}

  // Factory makes and returns an IntSimdMatrix (sub)class of the best
  // available type for the current architecture.
  static IntSimdMatrix* GetFastestMultiplier();

  // Computes a reshaped copy of the weight matrix w. If there are no
  // partial_funcs_, it does nothing.
  void Init(const GENERIC_2D_ARRAY<int8_t>& w);

  // Rounds the size up to a multiple of the input register size (in int8_t).
  int RoundInputs(int size) const {
    return Roundup(size, num_inputs_per_register_);
  }
  // Rounds the size up to a multiple of the output register size (in int32_t).
  int RoundOutputs(int size) const {
    return Roundup(size, num_outputs_per_register_);
  }

  // Computes matrix.vector v = Wu.
  // u is of size W.dim2() - 1 and the output v is of size W.dim1().
  // u is imagined to have an extra element at the end with value 1, to
  // implement the bias, but it doesn't actually have it.
  // Computes the base C++ implementation, if there are no partial_funcs_.
  // NOTE: The size of the input vector (u) must be padded using
  // RoundInputs above.
  // The input will be over-read to the extent of the padding. There are no
  // alignment requirements.
  void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
                       const GenericVector<double>& scales, const int8_t* u,
                       double* v) const;

 protected:
  // Function to compute part of a matrix.vector multiplication. The weights
  // are in a very specific order (see above) in w, which is multiplied by
  // u of length num_in, to produce output v after scaling the integer results
  // by the corresponding member of scales.
  // The amount of w and scales consumed is fixed and not available to the
  // caller. The number of outputs written to v will be at most num_out.
  typedef void (*PartialFunc)(const int8_t* w, const double* scales,
                              const int8_t* u, int num_in, int num_out,
                              double* v);

  // Rounds the input up to a multiple of the given factor.
  static int Roundup(int input, int factor) {
    return (input + factor - 1) / factor * factor;
  }

  // Number of 32 bit outputs held in each register.
  int num_outputs_per_register_;
  // Maximum number of registers that we will use to hold outputs.
  int max_output_registers_;
  // Number of 8 bit inputs in the inputs register.
  int num_inputs_per_register_;
  // Number of inputs in each weight group.
  int num_inputs_per_group_;
  // Number of groups of inputs to be broadcast.
  int num_input_groups_;
  // The weights matrix reorganized in whatever way suits this instance.
  std::vector<int8_t> shaped_w_;
  // A series of functions to compute a partial result.
  std::vector<PartialFunc> partial_funcs_;
};

}  // namespace tesseract

#endif  // TESSERACT_ARCH_INTSIMDMATRIX_H_
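
To make the "no virtual methods" design concrete: a subclass constructor only sets the shape members and supplies PartialFuncs, and the base-class Init()/MatrixDotVector() drive everything. The sketch below is a hypothetical scalar subclass written against the interface above; it is not the IntSimdMatrixAVX2/IntSimdMatrixSSE code added elsewhere in this commit.

// Hypothetical subclass sketch. With all sizes set to 1, Init() lays out the
// reshaped weights for each output as its num_in weights followed by its bias,
// so one PartialFunc call produces exactly one output.
#include "intsimdmatrix.h"

namespace tesseract {

class IntSimdMatrixScalar : public IntSimdMatrix {
 public:
  IntSimdMatrixScalar() {
    num_outputs_per_register_ = 1;
    max_output_registers_ = 1;
    num_inputs_per_register_ = 1;
    num_inputs_per_group_ = 1;
    num_input_groups_ = 1;
    partial_funcs_ = {PartialMatrixDotVector1};
  }

 private:
  // Computes one output: a dot product over num_in reshaped weights, then the
  // bias stored right after them, scaled by scales[0].
  static void PartialMatrixDotVector1(const int8_t* w, const double* scales,
                                      const int8_t* u, int num_in, int num_out,
                                      double* v) {
    if (num_out < 1) return;  // Nothing left to write.
    int total = 0;
    for (int i = 0; i < num_in; ++i) total += w[i] * u[i];
    // 127 plays the role of MAX_INT8 in the base implementation.
    v[0] = (static_cast<double>(total) / 127 + w[num_in]) * scales[0];
  }
};

}  // namespace tesseract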
