|
30 | 30 |
|
31 | 31 | import argparse
|
32 | 32 | import os
|
33 |
| -import re |
| 33 | +import regex as re |
| 34 | +import json |
34 | 35 | from math import atan, cos, sin
|
35 | 36 | from pathlib import Path
|
36 | 37 | from typing import Any, NamedTuple, Optional, Tuple, Union
|
|
40 | 41 | from reportlab.lib.units import inch
|
41 | 42 | from reportlab.pdfgen.canvas import Canvas
|
42 | 43 |
|
# Optional user-supplied typo corrections: a JSON object mapping a word to
# its replacement, read once at import time from the current working
# directory. A missing, unreadable or malformed file is not an error -- the
# table simply stays empty and no corrections are applied.
try:
    with open("typofixes.json", encoding="utf-8") as f:
        my_typofixes = json.load(f)
    if not isinstance(my_typofixes, dict):
        # The table is only ever queried by key, so anything other than a
        # JSON object is unusable.
        raise ValueError("typofixes.json must contain a JSON object")
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: covers
    # json.JSONDecodeError, UnicodeDecodeError and the non-object case above.
    # Deliberately narrower than a bare except so that KeyboardInterrupt and
    # SystemExit still propagate.
    print('No typos were corrected.')
    my_typofixes = {}
| 50 | + |
43 | 51 | # According to Wikipedia these languages are supported in the ISO-8859-1 character
|
44 | 52 | # set, meaning reportlab can generate them and they are compatible with hocr,
|
45 | 53 | # assuming Tesseract has the necessary languages installed. Note that there may
|
@@ -129,7 +137,7 @@ class HocrTransform:
|
129 | 137 | re.VERBOSE,
|
130 | 138 | )
|
# Translation table for str.translate: typographic ligatures are expanded to
# their plain-letter spelling, and the long s and several dash-like
# characters are mapped to ASCII. Every key must be a single code point --
# str.maketrans raises ValueError for longer string keys -- so the ligatures
# are spelled as escapes to keep them intact if this file is ever passed
# through a Unicode normaliser.
ligatures = str.maketrans(
    {
        '\ufb00': 'ff',  # LATIN SMALL LIGATURE FF
        '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
        '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
        '\ufb01': 'fi',  # LATIN SMALL LIGATURE FI
        '\ufb02': 'fl',  # LATIN SMALL LIGATURE FL
        'ſ': 's',  # long s
        '⸗': '-',  # double oblique hyphen
        '—': '-',  # em dash
        '·': '-',  # middle dot
    }
)
|
134 | 142 |
|
135 | 143 | def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
|
@@ -400,6 +408,14 @@ def _do_line(
|
400 | 408 | for elem in elements:
|
401 | 409 | elemtxt = self._get_element_text(elem).strip()
|
402 | 410 | elemtxt = self.replace_unsupported_chars(elemtxt)
|
| 411 | + |
| 412 | + try: |
| 413 | + pre, elemtxt, post = re.match("([^\p{Letter}^\p{Mark}]*-{0,1})([\w-]*\w+)(-{0,1}[^\p{Letter}^\p{Mark}]*)", elemtxt).groups() |
| 414 | + elemtxt = my_typofixes.get(elemtxt, elemtxt) |
| 415 | + elemtxt = pre + elemtxt + post |
| 416 | + except: |
| 417 | + pass |
| 418 | + |
403 | 419 | if elemtxt == '':
|
404 | 420 | continue
|
405 | 421 |
|
|
0 commit comments