Skip to content

Commit 0840107

Browse files
authored
Merge pull request #1 from femifrak/femifrak-typo_corrections
Update hocrtransform.py
2 parents b0ad07b + 915be1e commit 0840107

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

src/ocrmypdf/hocrtransform.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030

3131
import argparse
3232
import os
33-
import re
33+
import regex as re
34+
import json
3435
from math import atan, cos, sin
3536
from pathlib import Path
3637
from typing import Any, NamedTuple, Optional, Tuple, Union
@@ -40,6 +41,13 @@
4041
from reportlab.lib.units import inch
4142
from reportlab.pdfgen.canvas import Canvas
4243

44+
try:
45+
with open("typofixes.json") as f:
46+
my_typofixes = json.load(f)
47+
except:
48+
print('No typos were corrected.')
49+
my_typofixes = {}
50+
4351
# According to Wikipedia these languages are supported in the ISO-8859-1 character
4452
# set, meaning reportlab can generate them and they are compatible with hocr,
4553
# assuming Tesseract has the necessary languages installed. Note that there may
@@ -129,7 +137,7 @@ class HocrTransform:
129137
re.VERBOSE,
130138
)
131139
ligatures = str.maketrans(
132-
{'ﬀ': 'ff', 'ﬃ': 'f‌f‌i', 'ﬄ': 'f‌f‌l', 'ﬁ': 'fi', 'ﬂ': 'fl'}
140+
{'ﬀ': 'ff', 'ﬃ': 'f‌f‌i', 'ﬄ': 'f‌f‌l', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ſ': 's', '⸗': '-', '—': '-', '·': '-'}
133141
)
134142

135143
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
@@ -400,6 +408,14 @@ def _do_line(
400408
for elem in elements:
401409
elemtxt = self._get_element_text(elem).strip()
402410
elemtxt = self.replace_unsupported_chars(elemtxt)
411+
412+
try:
413+
pre, elemtxt, post = re.match("([^\p{Letter}^\p{Mark}]*-{0,1})([\w-]*\w+)(-{0,1}[^\p{Letter}^\p{Mark}]*)", elemtxt).groups()
414+
elemtxt = my_typofixes.get(elemtxt, elemtxt)
415+
elemtxt = pre + elemtxt + post
416+
except:
417+
pass
418+
403419
if elemtxt == '':
404420
continue
405421

0 commit comments

Comments
 (0)