-
Notifications
You must be signed in to change notification settings - Fork 3k
feat: adding support for images inside docx #277
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -714,6 +714,29 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: | |
result = mammoth.convert_to_html(docx_file, style_map=style_map) | ||
html_content = result.value | ||
result = self._convert(html_content) | ||
|
||
# Extract any base64 encoded images from the HTML | ||
descriptions = [] | ||
if kwargs.get("llm_client") and kwargs.get("llm_model"): | ||
for match in re.finditer(r'data:image/[^;]+;base64,([^"\']+)', html_content): | ||
img_converter = ImageConverter() | ||
descriptions.append(img_converter.convert_from_base64(match.group(1),'.png',**kwargs)) | ||
|
||
# Replace each base64 image with its description | ||
if descriptions and result: | ||
text_content = result.text_content | ||
|
||
# Find all base64 image markdown patterns | ||
base64_pattern = r'!\[[\s\S]*?\]\(data:image/[a-z]+;base64.*?\)' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This might be a little error-prone: if the source doc itself contains text that matches this pattern, there could be extras floating around in the doc. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, at least in my default use, I get images appearing as [example not captured in this page]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that is because of this: #1140 |
||
|
||
# Find all base64 image markdown patterns | ||
matches = list(re.finditer(base64_pattern, text_content)) | ||
|
||
# Replace each match with corresponding description | ||
for i, match in enumerate(matches): | ||
if i < len(descriptions): | ||
text_content = text_content.replace(match.group(), f'[Image description {i}] \n{descriptions[i]}\n[End Image description {i}]') | ||
result.text_content = text_content | ||
|
||
return result | ||
|
||
|
@@ -1114,6 +1137,59 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None | |
|
||
response = client.chat.completions.create(model=model, messages=messages) | ||
return response.choices[0].message.content | ||
|
||
def _get_llm_description_from_base64( | ||
self, | ||
base64_str: str, | ||
extension: str, | ||
client: Any, | ||
model: str, | ||
prompt: Optional[str] = None | ||
) -> str: | ||
"""Get LLM description for a base64-encoded image string.""" | ||
if prompt is None or prompt.strip() == "": | ||
prompt = "Write a detailed caption for this image." | ||
|
||
# Remove data URI prefix if present | ||
if ',' in base64_str: | ||
base64_str = base64_str.split(',')[1] | ||
|
||
# Create data URI | ||
content_type, encoding = mimetypes.guess_type("_dummy" + extension) | ||
if content_type is None: | ||
content_type = "image/jpeg" | ||
|
||
data_uri = f"data:{content_type};base64,{base64_str}" | ||
messages = [ | ||
{ | ||
"role": "user", | ||
"content": [ | ||
{"type": "text", "text": prompt}, | ||
{ | ||
"type": "image_url", | ||
"image_url": { | ||
"url": data_uri, | ||
}, | ||
}, | ||
], | ||
} | ||
] | ||
|
||
response = client.chat.completions.create(model=model, messages=messages) | ||
return response.choices[0].message.content | ||
|
||
def convert_from_base64(
    self,
    base64_str: str,
    extension: str,
    **kwargs: Any
) -> Optional[str]:
    """Return an LLM-generated description of a base64-encoded image.

    NOTE(review): this does more than convert from base64 -- it forwards the
    image to the LLM helper and returns the generated description -- so a
    more descriptive name is probably better.

    Args:
        base64_str: The base64 image payload, passed through unchanged to
            ``_get_llm_description_from_base64``.
        extension: File extension passed through to the same helper.
        **kwargs: Reads ``llm_client``, ``llm_model`` and the optional
            ``llm_prompt``; other keys are ignored.

    Returns:
        Whatever ``_get_llm_description_from_base64`` returns, or ``None``
        when ``llm_client`` or ``llm_model`` was not supplied.
    """
    client = kwargs.get("llm_client")
    model = kwargs.get("llm_model")
    # Without both a client and a model no request can be made; return None
    # rather than failing with an AttributeError on a missing client.
    if not client or not model:
        return None

    prompt = kwargs.get("llm_prompt")
    return self._get_llm_description_from_base64(
        base64_str, extension, client, model, prompt
    )
|
||
|
||
class OutlookMsgConverter(DocumentConverter): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, this should also check for `keep_data_uris` when calling `convert`; I'd imagine that gets passed along in the args.