@@ -269,19 +269,44 @@ def handle_equations_in_text(self, element, text):
269
269
for subt in element .iter ():
270
270
tag_name = etree .QName (subt ).localname
271
271
if tag_name == "t" and "math" not in subt .tag :
272
- only_texts .append (subt .text )
273
- texts_and_equations .append (subt .text )
272
+ if isinstance (subt .text , str ):
273
+ only_texts .append (subt .text )
274
+ texts_and_equations .append (subt .text )
274
275
elif "oMath" in subt .tag and "oMathPara" not in subt .tag :
275
276
latex_equation = str (oMath2Latex (subt ))
276
277
only_equations .append (latex_equation )
277
278
texts_and_equations .append (latex_equation )
278
279
279
- if "" .join (only_texts ).strip () != text .strip ():
280
+ if len (only_equations ) < 1 :
281
+ return text , []
282
+
283
+ if (
284
+ re .sub (r"\s+" , "" , "" .join (only_texts )).strip ()
285
+ != re .sub (r"\s+" , "" , text ).strip ()
286
+ ):
280
287
# If we are not able to reconstruct the initial raw text
281
288
# do not try to parse equations and return the original
282
289
return text , []
283
290
284
- return "" .join (texts_and_equations ), only_equations
291
+ # Insert equations into original text
292
+ # This is done to preserve white space structure
293
+ output_text = ""
294
+ init_i = 0
295
+ for i_substr , substr in enumerate (texts_and_equations ):
296
+ if substr not in text :
297
+ if i_substr > 0 :
298
+ i_text_before = text [init_i :].find (
299
+ texts_and_equations [i_substr - 1 ]
300
+ )
301
+ output_text += text [init_i :][
302
+ : i_text_before + len (texts_and_equations [i_substr - 1 ])
303
+ ]
304
+ init_i += i_text_before + len (texts_and_equations [i_substr - 1 ])
305
+ output_text += substr
306
+ if only_equations .index (substr ) == len (only_equations ) - 1 :
307
+ output_text += text [init_i :]
308
+
309
+ return output_text , only_equations
285
310
286
311
def handle_text_elements (
287
312
self ,
@@ -348,7 +373,7 @@ def handle_text_elements(
348
373
)
349
374
elif "Heading" in p_style_id :
350
375
style_element = getattr (paragraph .style , "element" , None )
351
- if style_element :
376
+ if style_element is not None :
352
377
is_numbered_style = (
353
378
"<w:numPr>" in style_element .xml or "<w:numPr>" in element .xml
354
379
)
0 commit comments