Skip to content

Commit 14a97bc

Browse files
committed
Improve text parsing
1 parent 44f2b08 commit 14a97bc

File tree

1 file changed

+30
-5
lines changed

1 file changed

+30
-5
lines changed

docling/backend/msword_backend.py

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -269,19 +269,44 @@ def handle_equations_in_text(self, element, text):
269269
for subt in element.iter():
270270
tag_name = etree.QName(subt).localname
271271
if tag_name == "t" and "math" not in subt.tag:
272-
only_texts.append(subt.text)
273-
texts_and_equations.append(subt.text)
272+
if isinstance(subt.text, str):
273+
only_texts.append(subt.text)
274+
texts_and_equations.append(subt.text)
274275
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
275276
latex_equation = str(oMath2Latex(subt))
276277
only_equations.append(latex_equation)
277278
texts_and_equations.append(latex_equation)
278279

279-
if "".join(only_texts).strip() != text.strip():
280+
if len(only_equations) < 1:
281+
return text, []
282+
283+
if (
284+
re.sub(r"\s+", "", "".join(only_texts)).strip()
285+
!= re.sub(r"\s+", "", text).strip()
286+
):
280287
# If we are not able to reconstruct the initial raw text
281288
# do not try to parse equations and return the original
282289
return text, []
283290

284-
return "".join(texts_and_equations), only_equations
291+
# Insert equations into original text
292+
# This is done to preserve white space structure
293+
output_text = ""
294+
init_i = 0
295+
for i_substr, substr in enumerate(texts_and_equations):
296+
if substr not in text:
297+
if i_substr > 0:
298+
i_text_before = text[init_i:].find(
299+
texts_and_equations[i_substr - 1]
300+
)
301+
output_text += text[init_i:][
302+
: i_text_before + len(texts_and_equations[i_substr - 1])
303+
]
304+
init_i += i_text_before + len(texts_and_equations[i_substr - 1])
305+
output_text += substr
306+
if only_equations.index(substr) == len(only_equations) - 1:
307+
output_text += text[init_i:]
308+
309+
return output_text, only_equations
285310

286311
def handle_text_elements(
287312
self,
@@ -348,7 +373,7 @@ def handle_text_elements(
348373
)
349374
elif "Heading" in p_style_id:
350375
style_element = getattr(paragraph.style, "element", None)
351-
if style_element:
376+
if style_element is not None:
352377
is_numbered_style = (
353378
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
354379
)

0 commit comments

Comments
 (0)