Skip to content

Commit 2e9fd69

Browse files
committed
use 'import pathlib'; fix "TypeError: argument of type 'WindowsPath' is not iterable"
1 parent a0527b4 commit 2e9fd69

File tree

1 file changed

+36
-31
lines changed

1 file changed

+36
-31
lines changed

src/training/tesstrain_utils.py

+36-31
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import sys
2020
from datetime import date
2121
from tempfile import TemporaryDirectory, mkdtemp
22-
from pathlib import Path
22+
import pathlib
2323
import logging
2424
import subprocess
2525
import argparse
@@ -75,8 +75,13 @@ def run_command(cmd, *args, env=None):
7575
err_exit(f"{cmd} not found")
7676

7777
log.debug(f"Running {cmd}")
78-
for arg in args:
78+
args = list(args)
79+
for idx, arg in enumerate(args):
7980
log.debug(arg)
81+
# Workaround for https://bugs.python.org/issue33617
82+
# (before Python 3.8, subprocess on Windows rejects path-like items in args with "TypeError: argument of type 'WindowsPath' is not iterable")
83+
if isinstance(arg, pathlib.WindowsPath):
84+
args[idx] = str(arg)
8085

8186
proc = subprocess.run(
8287
[cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
@@ -96,11 +101,11 @@ def run_command(cmd, *args, env=None):
96101
# Used to check required input files and produced output files in each phase.
97102
# Usage: check_file_readable(FILE1, FILE2, ...)
98103
def check_file_readable(*filenames):
99-
if isinstance(filenames, (str, Path)):
104+
if isinstance(filenames, (str, pathlib.Path)):
100105
filenames = [filenames]
101106
for filename in filenames:
102107
try:
103-
with Path(filename).open():
108+
with pathlib.Path(filename).open():
104109
pass
105110
except FileNotFoundError:
106111
err_exit(f"Required/expected file '{filename}' does not exist")
@@ -227,13 +232,13 @@ def parse_flags(argv=None):
227232
else:
228233
ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}", dir=ctx.tmp_dir)
229234
# Location of log file for the whole run.
230-
ctx.log_file = Path(ctx.training_dir) / "tesstrain.log"
235+
ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
231236
log.info(f"Log file location: {ctx.log_file}")
232237

233238
def show_tmpdir_location(training_dir):
234239
# On successful exit we will delete this first; on failure we want to let the user
235240
# know where the log is
236-
if Path(training_dir).exists():
241+
if pathlib.Path(training_dir).exists():
237242
print(f"Temporary files retained at: {training_dir}")
238243

239244
atexit.register(show_tmpdir_location, ctx.training_dir)
@@ -242,27 +247,27 @@ def show_tmpdir_location(training_dir):
242247
# specified in the command-line.
243248
if not ctx.training_text:
244249
ctx.training_text = (
245-
Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
250+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
246251
)
247252
if not ctx.wordlist_file:
248253
ctx.wordlist_file = (
249-
Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
254+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
250255
)
251256

252257
ctx.word_bigrams_file = (
253-
Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
258+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
254259
)
255260
ctx.numbers_file = (
256-
Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
261+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
257262
)
258-
ctx.punc_file = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
259-
ctx.bigram_freqs_file = Path(ctx.training_text).with_suffix(
263+
ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
264+
ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
260265
".training_text.bigram_freqs"
261266
)
262-
ctx.unigram_freqs_file = Path(ctx.training_text).with_suffix(
267+
ctx.unigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
263268
".training_text.unigram_freqs"
264269
)
265-
ctx.train_ngrams_file = Path(ctx.training_text).with_suffix(
270+
ctx.train_ngrams_file = pathlib.Path(ctx.training_text).with_suffix(
266271
".training_text.train_ngrams"
267272
)
268273
ctx.generate_dawgs = 1
@@ -278,8 +283,8 @@ def cleanup(ctx):
278283

279284
# Function initializes font config with a unique font cache dir.
280285
def initialize_fontconfig(ctx):
281-
sample_path = Path(ctx.font_config_cache) / "sample_text.txt"
282-
Path(sample_path).write_text("Text\n")
286+
sample_path = pathlib.Path(ctx.font_config_cache) / "sample_text.txt"
287+
pathlib.Path(sample_path).write_text("Text\n")
283288
log.info(f"Testing font: {ctx.fonts[0]}")
284289
run_command(
285290
"text2image",
@@ -296,7 +301,7 @@ def make_fontname(font):
296301

297302

298303
def make_outbase(ctx, fontname, exposure):
299-
return Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"
304+
return pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"
300305

301306

302307
# Helper function for phase_I_generate_image. Generates the image for a single
@@ -336,7 +341,7 @@ def generate_font_image(ctx, font, exposure, char_spacing):
336341

337342
check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
338343

339-
if ctx.extract_font_properties and Path(ctx.train_ngrams_file).exists():
344+
if ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists():
340345
log.info(f"Extracting font properties of {font}")
341346
run_command(
342347
"text2image",
@@ -362,16 +367,16 @@ def phase_I_generate_image(ctx, par_factor):
362367
char_spacing = 0.0
363368

364369
for exposure in ctx.exposures:
365-
if ctx.extract_font_properties and Path(ctx.bigram_freqs_file).exists():
370+
if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
366371
# Parse .bigram_freqs file and compose a .train_ngrams file with text
367372
# for tesseract to recognize during training. Take only the ngrams whose
368373
# combined weight accounts for 99% (p = 0.99) of all the bigrams in the language.
369-
lines = Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
374+
lines = pathlib.Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
370375
records = (line.split(" ") for line in lines)
371376
p = 0.99
372377
ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)
373378

374-
with Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
379+
with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
375380
cumsum = 0
376381
for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
377382
if cumsum > ngram_frac:
@@ -408,9 +413,9 @@ def phase_I_generate_image(ctx, par_factor):
408413
def phase_UP_generate_unicharset(ctx):
409414
log.info("=== Phase UP: Generating unicharset and unichar properties files ===")
410415

411-
box_files = Path(ctx.training_dir).glob("*.box")
416+
box_files = pathlib.Path(ctx.training_dir).glob("*.box")
412417

413-
ctx.unicharset_file = Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"
418+
ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"
414419

415420
run_command(
416421
"unicharset_extractor",
@@ -422,7 +427,7 @@ def phase_UP_generate_unicharset(ctx):
422427
)
423428
check_file_readable(ctx.unicharset_file)
424429

425-
ctx.xheights_file = Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
430+
ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
426431
run_command(
427432
"set_unicharset_properties",
428433
"-U",
@@ -512,12 +517,12 @@ def phase_UP_generate_unicharset(ctx):
512517
def phase_E_extract_features(ctx, box_config, ext):
513518
log.info(f"=== Phase E: Generating {ext} files ===")
514519

515-
img_files = list(Path(ctx.training_dir).glob("*.exp*.tif"))
520+
img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))
516521
log.debug(img_files)
517522

518523
# Use any available language-specific configs.
519524
config = ""
520-
testconfig = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
525+
testconfig = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
521526
if testconfig.exists():
522527
config = testconfig
523528
log.info(f"Using {ctx.lang_code}.config")
@@ -536,7 +541,7 @@ def phase_E_extract_features(ctx, box_config, ext):
536541
run_command,
537542
"tesseract",
538543
img_file,
539-
Path(img_file).with_suffix(""),
544+
pathlib.Path(img_file).with_suffix(""),
540545
*box_config,
541546
config,
542547
env=tessdata_environ,
@@ -552,7 +557,7 @@ def phase_E_extract_features(ctx, box_config, ext):
552557
pbar.update(1)
553558
# Check that all the output files were produced.
554559
for img_file in img_files:
555-
check_file_readable(Path(img_file.with_suffix("." + ext)))
560+
check_file_readable(pathlib.Path(img_file.with_suffix("." + ext)))
556561

557562
return
558563

@@ -640,7 +645,7 @@ def phase_E_extract_features(ctx, box_config, ext):
640645
def make_lstmdata(ctx):
641646
log.info("=== Constructing LSTM training data ===")
642647
lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
643-
path_output = Path(ctx.output_dir)
648+
path_output = pathlib.Path(ctx.output_dir)
644649
if not path_output.is_dir():
645650
log.info(f"Creating new directory {ctx.output_dir}")
646651
path_output.mkdir(exist_ok=True, parents=True)
@@ -672,7 +677,7 @@ def make_lstmdata(ctx):
672677
)
673678

674679
def get_file_list():
675-
training_path = Path(ctx.training_dir)
680+
training_path = pathlib.Path(ctx.training_dir)
676681
if ctx.save_box_tiff:
677682
log.info("=== Saving box/tiff pairs for training data ===")
678683
yield from training_path.glob(f"{ctx.lang_code}*.box")
@@ -686,7 +691,7 @@ def get_file_list():
686691

687692
lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
688693
dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
689-
Path(lstm_list).write_text("\n".join(dir_listing))
694+
pathlib.Path(lstm_list).write_text("\n".join(dir_listing))
690695

691696

692697
# make__traineddata() {

0 commit comments

Comments
 (0)