Skip to content

Commit 19616b0

Browse files
committed
lstm: Move class SIMDDetect to new source file and improve code
Also modify the code to use a singleton. This simplifies the code because no locking is needed. It also slightly improves performance because there is no longer a check for whether the architecture has already been tested. Signed-off-by: Stefan Weil <sw@weilnetz.de>
1 parent e949812 commit 19616b0

File tree

5 files changed

+123
-83
lines changed

5 files changed

+123
-83
lines changed

api/Makefile.am

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ libtesseract_api_la_LIBADD = \
2828
../wordrec/libtesseract_wordrec.la \
2929
../classify/libtesseract_classify.la \
3030
../dict/libtesseract_dict.la \
31+
../arch/libtesseract_arch.la \
3132
../arch/libtesseract_avx.la \
3233
../arch/libtesseract_sse.la \
3334
../lstm/libtesseract_lstm.la \
@@ -57,6 +58,7 @@ libtesseract_la_LIBADD = \
5758
../wordrec/libtesseract_wordrec.la \
5859
../classify/libtesseract_classify.la \
5960
../dict/libtesseract_dict.la \
61+
../arch/libtesseract_arch.la \
6062
../arch/libtesseract_avx.la \
6163
../arch/libtesseract_sse.la \
6264
../lstm/libtesseract_lstm.la \

arch/Makefile.am

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
AM_CPPFLAGS += -I$(top_srcdir)/ccutil
1+
AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer
22
AUTOMAKE_OPTIONS = subdir-objects
33
SUBDIRS =
44
AM_CXXFLAGS =
@@ -8,15 +8,17 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
88
AM_CPPFLAGS += -DTESS_EXPORTS
99
endif
1010

11-
include_HEADERS = \
12-
dotproductavx.h dotproductsse.h
11+
include_HEADERS = dotproductavx.h dotproductsse.h simddetect.h
1312

14-
noinst_HEADERS =
13+
noinst_HEADERS =
1514

1615
if !USING_MULTIPLELIBS
1716
noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
17+
noinst_LTLIBRARIES += libtesseract_arch.la
1818
else
1919
lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
20+
lib_LTLIBRARIES += libtesseract_arch.la
21+
libtesseract_arch_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
2022
libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
2123
libtesseract_sse_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
2224
endif
@@ -28,6 +30,8 @@ if SSE41_OPT
2830
libtesseract_sse_la_CXXFLAGS = -msse4.1
2931
endif
3032

33+
libtesseract_arch_la_SOURCES = simddetect.cpp
34+
3135
libtesseract_avx_la_SOURCES = dotproductavx.cpp
3236

3337
libtesseract_sse_la_SOURCES = dotproductsse.cpp

arch/simddetect.cpp

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
///////////////////////////////////////////////////////////////////////
2+
// File: simddetect.cpp
3+
// Description: Architecture detector.
4+
// Author: Stefan Weil (based on code from Ray Smith)
5+
//
6+
// (C) Copyright 2014, Google Inc.
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
///////////////////////////////////////////////////////////////////////
17+
18+
#include "simddetect.h"
19+
#include "tprintf.h"
20+
21+
#undef X86_BUILD
22+
#if defined(__x86_64__) || defined(__i386__) || defined(_WIN32)
23+
# if !defined(ANDROID_BUILD)
24+
# define X86_BUILD 1
25+
# endif // !ANDROID_BUILD
26+
#endif // x86 target
27+
28+
#if defined(X86_BUILD)
29+
# if defined(__linux__) || defined(__MINGW32__)
30+
# include <cpuid.h>
31+
# elif defined(_WIN32)
32+
# include <intrin.h>
33+
# endif
34+
#endif
35+
36+
// The singleton instance. Its constructor performs the CPU feature detection
// when the statics of this translation unit are dynamically initialized.
SIMDDetect SIMDDetect::detector;
37+
38+
// If true, then AVX has been detected.
// Declared without an initializer, so as a static it is zero-initialized
// (false) before the constructor of `detector` runs.
39+
bool SIMDDetect::avx_available_;
40+
// If true, then SSE4.1 has been detected.
// Zero-initialized (false) in the same way as avx_available_.
41+
bool SIMDDetect::sse_available_;
42+
43+
// Constructor.
// Runs when the static singleton `detector` is constructed.
44+
// Tests the architecture in a system-dependent way to detect AVX, SSE and
45+
// any other available SIMD equipment.
// The results are written to the static flags sse_available_ and
// avx_available_. Both flags are left untouched when X86_BUILD is not
// defined, when neither platform branch below applies, or when CPUID leaf 1
// is reported as unavailable.
// NOTE(review): tprintf is called from here, i.e. during static
// initialization; confirm tprintf does not depend on other static objects
// that may not be constructed yet.
46+
SIMDDetect::SIMDDetect() {
47+
#if defined(X86_BUILD)
48+
# if defined(__linux__) || defined(__MINGW32__)
49+
unsigned int eax, ebx, ecx, edx;
// Query CPUID leaf 1; a zero return value is treated as "no information".
50+
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
// Mask 0x00080000 is bit 19 of ECX, used here as the SSE4.1 flag.
51+
sse_available_ = (ecx & 0x00080000) != 0;
// Mask 0x10000000 is bit 28 of ECX, used here as the AVX flag.
// NOTE(review): this only tests the CPU feature bit; whether the OS has
// enabled AVX state saving (OSXSAVE/XGETBV) is not checked - confirm.
52+
avx_available_ = (ecx & 0x10000000) != 0;
53+
}
54+
# elif defined(_WIN32)
55+
int cpuInfo[4];
// Leaf 0 is queried first; cpuInfo[0] is then compared against 1 so that
// leaf 1 is only requested when it is reported as supported.
56+
__cpuid(cpuInfo, 0);
57+
if (cpuInfo[0] >= 1) {
58+
__cpuid(cpuInfo, 1);
// cpuInfo[2] is tested with the same masks as ECX in the branch above.
59+
sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
60+
avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
61+
}
62+
# endif
// Report every detected feature once via tprintf.
63+
if (avx_available_) tprintf("Found AVX\n");
64+
if (sse_available_) tprintf("Found SSE\n");
65+
#endif // X86_BUILD
66+
}

arch/simddetect.h

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
///////////////////////////////////////////////////////////////////////
2+
// File: simddetect.h
3+
// Description: Architecture detector.
4+
// Author: Stefan Weil (based on code from Ray Smith)
5+
//
6+
// (C) Copyright 2014, Google Inc.
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
///////////////////////////////////////////////////////////////////////
17+
18+
// NOTE(review): no include guard is visible in this header - confirm it is
// never included twice in one translation unit, or add a guard.
// Architecture detector. Add code here to detect any other architectures for
19+
// SIMD-based faster dot product functions. All state is static and the
20+
// constructor is private, so the only instance is the static member `detector`.
21+
class SIMDDetect {
22+
public:
23+
// Returns true if AVX is available on this system.
// Simply reads the static flag; no detection is performed by this call.
24+
static inline bool IsAVXAvailable() {
25+
return detector.avx_available_;
26+
}
27+
// Returns true if SSE4.1 is available on this system.
// Simply reads the static flag; no detection is performed by this call.
28+
static inline bool IsSSEAvailable() {
29+
return detector.sse_available_;
30+
}
31+
32+
private:
33+
// Constructor, must set all static member variables.
// Private, so that only the static member `detector` can be constructed.
34+
SIMDDetect();
35+
36+
private:
37+
// Singleton.
38+
static SIMDDetect detector;
39+
// If true, then AVX has been detected.
40+
static bool avx_available_;
41+
// If true, then SSE4.1 has been detected.
42+
static bool sse_available_;
43+
};

lstm/weightmatrix.cpp

+4-79
Original file line numberDiff line numberDiff line change
@@ -18,89 +18,14 @@
1818

1919
#include "weightmatrix.h"
2020

21-
#undef X86_BUILD
22-
#if defined(__x86_64__) || defined(__i386__) || defined(_WIN32)
23-
# if !defined(ANDROID_BUILD)
24-
# define X86_BUILD 1
25-
# endif // !ANDROID_BUILD
26-
#endif // x86 target
27-
28-
#if defined(X86_BUILD)
29-
# if defined(__linux__) || defined(__MINGW32__)
30-
# include <cpuid.h>
31-
# elif defined(_WIN32)
32-
# include <intrin.h>
33-
# endif
34-
#endif
3521
#include "dotproductavx.h"
3622
#include "dotproductsse.h"
23+
#include "simddetect.h"
3724
#include "statistc.h"
38-
#include "svutil.h"
3925
#include "tprintf.h"
4026

4127
namespace tesseract {
4228

43-
// Architecture detector. Add code here to detect any other architectures for
44-
// SIMD-based faster dot product functions. Intended to be a single static
45-
// object, but it does no real harm to have more than one.
46-
class SIMDDetect {
47-
public:
48-
SIMDDetect()
49-
: arch_tested_(false), avx_available_(false), sse_available_(false) {}
50-
51-
// Returns true if AVX is available on this system.
52-
bool IsAVXAvailable() {
53-
if (!arch_tested_) TestArchitecture();
54-
return avx_available_;
55-
}
56-
// Returns true if SSE4.1 is available on this system.
57-
bool IsSSEAvailable() {
58-
if (!arch_tested_) TestArchitecture();
59-
return sse_available_;
60-
}
61-
62-
private:
63-
// Tests the architecture in a system-dependent way to detect AVX, SSE and
64-
// any other available SIMD equipment.
65-
void TestArchitecture() {
66-
SVAutoLock lock(&arch_mutex_);
67-
if (!arch_tested_) {
68-
#if defined(X86_BUILD)
69-
# if defined(__linux__) || defined(__MINGW32__)
70-
unsigned int eax, ebx, ecx, edx;
71-
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
72-
sse_available_ = (ecx & 0x00080000) != 0;
73-
avx_available_ = (ecx & 0x10000000) != 0;
74-
}
75-
# elif defined(_WIN32)
76-
int cpuInfo[4];
77-
__cpuid(cpuInfo, 0);
78-
if (cpuInfo[0] >= 1) {
79-
__cpuid(cpuInfo, 1);
80-
sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
81-
avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
82-
}
83-
# endif
84-
if (avx_available_) tprintf("Found AVX\n");
85-
if (sse_available_) tprintf("Found SSE\n");
86-
#endif // X86_BUILD
87-
arch_tested_ = true;
88-
}
89-
}
90-
91-
private:
92-
// Detect architecture in only a single thread.
93-
SVMutex arch_mutex_;
94-
// Flag set to true after TestArchitecture has been called.
95-
bool arch_tested_;
96-
// If true, then AVX has been detected.
97-
bool avx_available_;
98-
// If true, then SSe4.1 has been detected.
99-
bool sse_available_;
100-
};
101-
102-
static SIMDDetect detector;
103-
10429
// Copies the whole input transposed, converted to double, into *this.
10530
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<double>& input) {
10631
int width = input.dim1();
@@ -258,7 +183,7 @@ void WeightMatrix::MatrixDotVector(const inT8* u, double* v) const {
258183
for (int i = 0; i < num_out; ++i) {
259184
const inT8* Wi = wi_[i];
260185
int total = 0;
261-
if (detector.IsSSEAvailable()) {
186+
if (SIMDDetect::IsSSEAvailable()) {
262187
total = IntDotProductSSE(u, Wi, num_in);
263188
} else {
264189
for (int j = 0; j < num_in; ++j) total += Wi[j] * u[j];
@@ -410,8 +335,8 @@ double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
410335
// is about 8% faster than sse. This suggests that the time is memory
411336
// bandwidth constrained and could benefit from holding the reused vector
412337
// in AVX registers.
413-
if (detector.IsAVXAvailable()) return DotProductAVX(u, v, n);
414-
if (detector.IsSSEAvailable()) return DotProductSSE(u, v, n);
338+
if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
339+
if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
415340
double total = 0.0;
416341
for (int k = 0; k < n; ++k) total += u[k] * v[k];
417342
return total;

0 commit comments

Comments
 (0)