Skip to content

feat: adding support for images inside docx #277

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,29 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)

# Extract any base64 encoded images from the HTML
descriptions = []
if kwargs.get("llm_client") and kwargs.get("llm_model"):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this should also check for keep_data_uris when calling convert; I'd imagine that gets passed along in the args.

for match in re.finditer(r'data:image/[^;]+;base64,([^"\']+)', html_content):
img_converter = ImageConverter()
descriptions.append(img_converter.convert_from_base64(match.group(1),'.png',**kwargs))

# Replace each base64 image with its description
if descriptions and result:
text_content = result.text_content

# Find all base64 image markdown patterns
base64_pattern = r'!\[[\s\S]*?\]\(data:image/[a-z]+;base64.*?\)'
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be a little error-prone: if the source doc itself has a pattern match, there could be extras floating around in the doc.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, at least in my default use, I get images appearing as ![](data:image/png;base64...), so I think there needs to be another check for whether the data actually exists in the output, or to fetch it another way, or to ensure flags are enabled to embed the image data into the md.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think because of this; #1140


# Find all base64 image markdown patterns
matches = list(re.finditer(base64_pattern, text_content))

# Replace each match with corresponding description
for i, match in enumerate(matches):
if i < len(descriptions):
text_content = text_content.replace(match.group(), f'[Image description {i}] \n{descriptions[i]}\n[End Image description {i}]')
result.text_content = text_content

return result

Expand Down Expand Up @@ -1114,6 +1137,59 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None

response = client.chat.completions.create(model=model, messages=messages)
return response.choices[0].message.content

def _get_llm_description_from_base64(
    self,
    base64_str: str,
    extension: str,
    client: Any,
    model: str,
    prompt: Optional[str] = None,
) -> str:
    """Ask an LLM to describe a base64-encoded image.

    Args:
        base64_str: The base64 payload, either bare or as a full data URI
            (``data:image/png;base64,...``). Anything up to and including
            the first comma is treated as a data URI prefix and removed.
        extension: File extension used to look up the image MIME type,
            with or without the leading dot (``".png"`` or ``"png"``).
        client: Object on which ``client.chat.completions.create(model=...,
            messages=...)`` is called.
        model: Model identifier forwarded to ``create``.
        prompt: Instruction sent with the image. When ``None`` or blank, a
            default captioning prompt is used.

    Returns:
        ``response.choices[0].message.content`` from the ``create`` call.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Remove the data URI prefix if present, remembering the media type it
    # declares so it can serve as a fallback below.
    declared_type = None
    prefix, comma, payload = base64_str.partition(",")
    if comma:
        base64_str = payload
        if prefix.startswith("data:"):
            declared_type = prefix[len("data:"):].split(";", 1)[0].strip() or None
    # Whitespace around the payload would end up inside the URI otherwise.
    base64_str = base64_str.strip()

    # mimetypes only recognises the extension when it starts with a dot, so
    # "png" would otherwise be silently labelled as image/jpeg.
    extension = (extension or "").strip()
    if extension and not extension.startswith("."):
        extension = "." + extension

    content_type, _ = mimetypes.guess_type("_dummy" + extension)
    if content_type is None:
        # Prefer what the data URI said about itself over a blind guess.
        content_type = declared_type or "image/jpeg"

    data_uri = f"data:{content_type};base64,{base64_str}"
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

def convert_from_base64(
    self,
    base64_str: str,
    extension: str,
    **kwargs: Any,
) -> Optional[str]:
    """Return an LLM-generated description of a base64-encoded image.

    Args:
        base64_str: The base64 image payload, forwarded unchanged to
            ``_get_llm_description_from_base64``.
        extension: File extension for the image, forwarded unchanged.
        **kwargs: Reads ``llm_client``, ``llm_model`` and the optional
            ``llm_prompt``; other keys are ignored.

    Returns:
        The description produced by ``_get_llm_description_from_base64``,
        or ``None`` when ``llm_client`` or ``llm_model`` is not supplied.
    """
    client = kwargs.get("llm_client")
    model = kwargs.get("llm_model")
    # Without both a client and a model there is nothing to call; return
    # None rather than failing on an attribute lookup on None.
    if client is None or model is None:
        return None

    # NOTE(review): reviewer feedback on this method: "convert from base64
    # does more than just converting from base 64; probably better to
    # rename". The name is kept here so existing callers keep working.
    prompt = kwargs.get("llm_prompt")
    return self._get_llm_description_from_base64(
        base64_str, extension, client, model, prompt
    )


class OutlookMsgConverter(DocumentConverter):
Expand Down