Skip to content

Commit b0ead95

Browse files
committed
Changed the way unicharsets are handled to allow support for the ™ character. Can't find the issue where it was requested.
1 parent 4efc539 commit b0ead95

File tree

9 files changed

+177
-112
lines changed

9 files changed

+177
-112
lines changed

ccstruct/ratngs.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
#include "ratngs.h"
2626

27+
#include <string>
2728
#include "blobs.h"
2829
#include "callcpp.h"
2930
#include "genericvector.h"
@@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
200201
: unicharset_(&unicharset){
201202
GenericVector<UNICHAR_ID> encoding;
202203
GenericVector<char> lengths;
203-
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
204+
string cleaned = unicharset.CleanupString(src_string);
205+
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
206+
NULL)) {
204207
lengths.push_back('\0');
205208
STRING src_lengths = &lengths[0];
206-
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
209+
this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
207210
} else { // There must have been an invalid unichar in the string.
208211
this->init(8);
209212
this->make_bad();

ccutil/ambigs.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
357357
// Insert the corresponding correct ngram into the unicharset.
358358
// Unicharset code assumes that the "base" ngram is inserted into
359359
// the unicharset before fragments of this ngram are inserted.
360-
unicharset->unichar_insert(replacement_string);
360+
unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
361361
ambig_spec->correct_ngram_id =
362362
unicharset->unichar_to_id(replacement_string);
363363
if (replacement_ambig_part_size > 1) {
@@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
372372
} else {
373373
STRING frag_str = CHAR_FRAGMENT::to_string(
374374
replacement_string, i, test_ambig_part_size, false);
375-
unicharset->unichar_insert(frag_str.string());
375+
unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
376376
unichar_id = unicharset->unichar_to_id(frag_str.string());
377377
}
378378
ambig_spec->correct_fragments[i] = unichar_id;

ccutil/unicharcompress.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
117117
direct_set.clear();
118118
radicals.clear();
119119
// Always keep space as 0;
120-
direct_set.unichar_insert(" ");
120+
direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
121121
// Null char is next if we have one.
122122
if (null_id >= 0) {
123123
direct_set.unichar_insert(kNullChar);
@@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
160160
if (it != radical_map.end()) {
161161
// This is Han. Convert to radical, stroke, index.
162162
if (!radicals.contains_unichar(it->second.radical.string())) {
163-
radicals.unichar_insert(it->second.radical.string());
163+
radicals.unichar_insert(it->second.radical.string(),
164+
OldUncleanUnichars::kTrue);
164165
}
165166
int radical = radicals.unichar_to_id(it->second.radical.string());
166167
int num_strokes = it->second.num_strokes;

ccutil/unicharmap.cpp

+18-54
Original file line numberDiff line numberDiff line change
@@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
3131
delete[] nodes;
3232
}
3333

34-
// Search the given unichar representation in the tree. Each character in the
35-
// string is interpreted as an index in an array of nodes.
36-
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
37-
const char* current_char = unichar_repr;
38-
UNICHARMAP_NODE* current_nodes = nodes;
39-
40-
assert(*unichar_repr != '\0');
41-
42-
do {
43-
if (*(current_char + 1) == '\0')
44-
return current_nodes[static_cast<unsigned char>(*current_char)].id;
45-
current_nodes =
46-
current_nodes[static_cast<unsigned char>(*current_char)].children;
47-
++current_char;
48-
} while (true);
49-
}
50-
5134
// Search the given unichar representation in the tree, using length characters
5235
// from it maximum. Each character in the string is interpreted as an index in
5336
// an array of nodes.
5437
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
5538
int length) const {
56-
const char* current_char = unichar_repr;
5739
UNICHARMAP_NODE* current_nodes = nodes;
5840

5941
assert(*unichar_repr != '\0');
6042
assert(length > 0 && length <= UNICHAR_LEN);
6143

44+
int index = 0;
45+
if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
6246
do {
63-
if (length == 1 || *(current_char + 1) == '\0')
64-
return current_nodes[static_cast<unsigned char>(*current_char)].id;
47+
if (index + 1 >= length || unichar_repr[index + 1] == '\0')
48+
return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
6549
current_nodes =
66-
current_nodes[static_cast<unsigned char>(*current_char)].children;
67-
++current_char;
68-
--length;
50+
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
51+
++index;
6952
} while (true);
7053
}
7154

@@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
7558
// string is interpreted as an index in an array of nodes.
7659
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
7760
const char* current_char = unichar_repr;
61+
if (*current_char == '\0') return;
7862
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
79-
80-
assert(*unichar_repr != '\0');
81-
assert(id >= 0);
82-
8363
do {
8464
if (*current_nodes_pointer == 0)
8565
*current_nodes_pointer = new UNICHARMAP_NODE[256];
86-
if (*(current_char + 1) == '\0') {
66+
if (current_char[1] == '\0') {
8767
(*current_nodes_pointer)
8868
[static_cast<unsigned char>(*current_char)].id = id;
8969
return;
@@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
9575
} while (true);
9676
}
9777

98-
// Search the given unichar representation in the tree. Each character in the
99-
// string is interpreted as an index in an array of nodes. Stop once the tree
100-
// does not have anymore nodes or once we found the right unichar_repr.
101-
bool UNICHARMAP::contains(const char* const unichar_repr) const {
102-
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
103-
104-
const char* current_char = unichar_repr;
105-
UNICHARMAP_NODE* current_nodes = nodes;
106-
107-
while (current_nodes != 0 && *(current_char + 1) != '\0') {
108-
current_nodes =
109-
current_nodes[static_cast<unsigned char>(*current_char)].children;
110-
++current_char;
111-
}
112-
return current_nodes != 0 && *(current_char + 1) == '\0' &&
113-
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
114-
}
115-
11678
// Search the given unichar representation in the tree, using length characters
11779
// from it maximum. Each character in the string is interpreted as an index in
11880
// an array of nodes. Stop once the tree does not have any more nodes or once we
@@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
12183
int length) const {
12284
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
12385
if (length <= 0 || length > UNICHAR_LEN) return false;
124-
125-
const char* current_char = unichar_repr;
86+
int index = 0;
87+
if (index >= length || unichar_repr[index] == '\0') return false;
12688
UNICHARMAP_NODE* current_nodes = nodes;
12789

128-
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
90+
while (current_nodes != 0 && index + 1 < length &&
91+
unichar_repr[index + 1] != '\0') {
12992
current_nodes =
130-
current_nodes[static_cast<unsigned char>(*current_char)].children;
131-
--length;
132-
++current_char;
93+
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
94+
++index;
13395
}
134-
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
135-
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
96+
return current_nodes != 0 &&
97+
(index + 1 >= length || unichar_repr[index + 1] == '\0') &&
98+
current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
13699
}
137100

138101
// Return the minimum number of characters that must be used from this string
139102
// to obtain a match in the UNICHARMAP.
140103
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
141104
const char* current_char = unichar_repr;
105+
if (*current_char == '\0') return 0;
142106
UNICHARMAP_NODE* current_nodes = nodes;
143107

144108
while (current_nodes != NULL && *current_char != '\0') {

ccutil/unicharmap.h

-9
Original file line numberDiff line numberDiff line change
@@ -36,21 +36,12 @@ class UNICHARMAP {
3636
// with the given id. The length of the representation MUST be non-zero.
3737
void insert(const char* const unichar_repr, UNICHAR_ID id);
3838

39-
// Return the id associated with the given unichar representation,
40-
// this representation MUST exist within the UNICHARMAP.
41-
// The length of the representation MUST be non-zero.
42-
UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
43-
4439
// Return the id associated with the given unichar representation,
4540
// this representation MUST exist within the UNICHARMAP. The first
4641
// length characters (maximum) from unichar_repr are used. The length
4742
// MUST be non-zero.
4843
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
4944

50-
// Return true if the given unichar representation is already present in the
51-
// UNICHARMAP. The length of the representation MUST be non-zero.
52-
bool contains(const char* const unichar_repr) const;
53-
5445
// Return true if the given unichar representation is already present in the
5546
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
5647
// used. The length MUST be non-zero.

0 commit comments

Comments
 (0)