From eb4d17bba5940a240bd80bed73837fdd7ef81217 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 29 Mar 2025 11:56:42 +0000 Subject: [PATCH 1/6] chore: bump version to 2.28.4 [skip ci] Signed-off-by: Rafael Teixeira de Lima --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bb86f666..fdbd2b22e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## [v2.28.4](https://github.com/docling-project/docling/releases/tag/v2.28.4) - 2025-03-29 + +### Fix + +* Fixes tables when using OCR ([#1261](https://github.com/docling-project/docling/issues/1261)) ([`7afad7e`](https://github.com/docling-project/docling/commit/7afad7e52da642b258edd67f8f4815ea430f05e1)) + ## [v2.28.3](https://github.com/docling-project/docling/releases/tag/v2.28.3) - 2025-03-28 ### Fix diff --git a/pyproject.toml b/pyproject.toml index 3e94e9d88..dd48a9d27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.28.3" # DO NOT EDIT, updated automatically +version = "2.28.4" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = [ "Christoph Auer ", From 76982a5b1594f4c420a77cc6c7ea6ad0357d299e Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Mon, 31 Mar 2025 11:41:06 +0200 Subject: [PATCH 2/6] Improve text parsing Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 35 ++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 47775b89f..941a4b4ab 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -269,19 +269,44 @@ def handle_equations_in_text(self, element, text): for subt in element.iter(): tag_name = etree.QName(subt).localname if tag_name == "t" and "math" not in subt.tag: - only_texts.append(subt.text) - texts_and_equations.append(subt.text) + if isinstance(subt.text, str): + only_texts.append(subt.text) + texts_and_equations.append(subt.text) elif "oMath" in subt.tag and "oMathPara" not in subt.tag: latex_equation = str(oMath2Latex(subt)) only_equations.append(latex_equation) texts_and_equations.append(latex_equation) - if "".join(only_texts).strip() != text.strip(): + if len(only_equations) < 1: + return text, [] + + if ( + re.sub(r"\s+", "", "".join(only_texts)).strip() + != re.sub(r"\s+", "", text).strip() + ): # If we are not able to reconstruct the initial raw text # do not try to parse equations and return the original return text, [] - return "".join(texts_and_equations), only_equations + # Insert equations into original text + # This is done to preserve white space structure + output_text = "" + init_i = 0 + for i_substr, substr in enumerate(texts_and_equations): + if substr not in text: + if i_substr > 0: + i_text_before = text[init_i:].find( + texts_and_equations[i_substr - 1] + ) + output_text += text[init_i:][ + : i_text_before + len(texts_and_equations[i_substr - 1]) + ] + init_i += i_text_before + len(texts_and_equations[i_substr - 1]) + output_text += substr + if only_equations.index(substr) == len(only_equations) - 1: + output_text += text[init_i:] + + return output_text, only_equations def handle_text_elements( self, @@ -348,7 +373,7 @@ def handle_text_elements( ) elif "Heading" in p_style_id: style_element = getattr(paragraph.style, "element", None) - if style_element: + if style_element is not None: is_numbered_style = ( "" in style_element.xml or "" in element.xml ) From d5431577f0c1e9dee9803a848fb3c89aa35f7364 Mon Sep 17 00:00:00 2001 From: Guilhem VERMOREL <83694424+guilhemvermorel@users.noreply.github.com> Date: Mon, 31 Mar 2025 10:53:49 +0200 Subject: [PATCH 3/6] fix: Tesseract OCR CLI can't process images composed with numbers only (#1201) fix wrong type text extracted by tesseract_ocr_cli_model Signed-off-by: gvl4 Co-authored-by: gvl4 Signed-off-by: Rafael Teixeira de Lima --- docling/models/tesseract_ocr_cli_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 56968a2ef..1e7fe039f 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -247,7 +247,7 @@ def __call__( cell = TextCell( index=ix, - text=text, + text=str(text), orig=text, from_ocr=True, confidence=conf / 100.0, From e535209c7587e8d2890c685e83f52265c1445bba Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Wed, 2 Apr 2025 10:32:36 +0200 Subject: [PATCH 4/6] Flexibilize heading detection Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 41 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 941a4b4ab..34cdaec88 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -234,33 +234,44 @@ def get_numId_and_ilvl( return None, None # If the paragraph is not part of a list + def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]: + parts = self.split_text_and_number(style_label) + + if len(parts) == 2: + parts.sort() + label_str: str = "" + label_level: Optional[int] = 0 + if parts[0].strip().lower() == "heading": + label_str = "Heading" + label_level = self.str_to_int(parts[1], None) + if parts[1].strip().lower() == "heading": + label_str = "Heading" + label_level = self.str_to_int(parts[0], None) + return label_str, label_level + + return style_label, None + def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: if paragraph.style is None: return "Normal", None + label = paragraph.style.style_id + name = paragraph.style.name + if label is None: return "Normal", None + if ":" in label: parts = label.split(":") - if len(parts) == 2: return parts[0], self.str_to_int(parts[1], None) - parts = self.split_text_and_number(label) + if "heading" in label.lower(): + return self.get_heading_and_level(label) + if "heading" in name.lower(): + return self.get_heading_and_level(name) - if "Heading" in label and len(parts) == 2: - parts.sort() - label_str: str = "" - label_level: Optional[int] = 0 - if parts[0] == "Heading": - label_str = parts[0] - label_level = self.str_to_int(parts[1], None) - if parts[1] == "Heading": - label_str = parts[1] - label_level = self.str_to_int(parts[0], None) - return label_str, label_level - else: - return label, None + return label, None def handle_equations_in_text(self, element, text): only_texts = [] From 331c6ab466dcf35f37447a4e63310230a5f33545 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Wed, 2 Apr 2025 11:29:14 +0200 Subject: [PATCH 5/6] Fix trailing space Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 34cdaec88..8b553828e 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -412,8 +412,8 @@ def handle_text_elements( if len(text_tmp) == 0: break - pre_eq_text = text_tmp.split(eq, maxsplit=1)[0] - text_tmp = text_tmp.split(eq, maxsplit=1)[1] + pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0] + text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1] if len(pre_eq_text) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, From c979eaab1ac3738a1629e4343b2b108f0c47e93a Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Wed, 2 Apr 2025 12:02:05 +0200 Subject: [PATCH 6/6] Remove trailing space Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 8b553828e..5094c8f9e 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -429,7 +429,7 @@ def handle_text_elements( doc.add_text( label=DocItemLabel.PARAGRAPH, parent=inline_equation, - text=text_tmp, + text=text_tmp.strip(), ) elif p_style_id in [