Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,32 @@ result = md.convert("example.jpg")
print(result.text_content)
```

Alternatively, you can provide a custom `llm_describber` callback function to generate image descriptions with any LLM. This offers greater flexibility, allowing you to use any multimodal LLM library.

```python
from markitdown import MarkItDown

# Assume a hypothetical gemini_client is available
# from gemini.client import GeminiClient
# gemini_client = GeminiClient()

def gemini_image_describber(data_uri: str, prompt: str) -> str:
    """Describe an image via a hypothetical Gemini client.

    Receives the image as a base64 data URI together with the captioning
    prompt, and returns the description text.
    """
    # response = gemini_client.multimodal.generate_content(
    #     prompt=prompt,
    #     image_data=data_uri
    # )
    # return response.text
    preview = data_uri[:50]
    return f"Description for prompt '{prompt}' and image data starting with '{preview}...'"


md = MarkItDown(llm_describber=gemini_image_describber)
result = md.convert("my_image.jpg")
print(result.markdown)
```

### Docker

```sh
Expand Down
7 changes: 6 additions & 1 deletion packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import io
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any, List, Dict, Optional, Union, BinaryIO
from typing import Any, Callable, List, Dict, Optional, Union, BinaryIO
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn
Expand Down Expand Up @@ -116,6 +116,7 @@ def __init__(
self._llm_client: Any = None
self._llm_model: Union[str | None] = None
self._llm_prompt: Union[str | None] = None
self._llm_describber: Union[Callable[..., Union[str, None]], None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None

Expand All @@ -141,6 +142,7 @@ def enable_builtins(self, **kwargs) -> None:
self._llm_client = kwargs.get("llm_client")
self._llm_model = kwargs.get("llm_model")
self._llm_prompt = kwargs.get("llm_prompt")
self._llm_describber = kwargs.get("llm_describber")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")

Expand Down Expand Up @@ -564,6 +566,9 @@ def _convert(
if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
_kwargs["llm_prompt"] = self._llm_prompt

if "llm_describber" not in _kwargs and self._llm_describber is not None:
_kwargs["llm_describber"] = self._llm_describber

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

Expand Down
91 changes: 68 additions & 23 deletions packages/markitdown/src/markitdown/converters/_image_converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import BinaryIO, Any, Union
from typing import BinaryIO, Any, Union, Callable
import base64
import mimetypes
from ._exiftool import exiftool_metadata
Expand All @@ -15,7 +15,8 @@

class ImageConverter(DocumentConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM.
The LLM interaction can be customized by passing a callback function.
"""

def accepts(
Expand Down Expand Up @@ -65,37 +66,42 @@ def convert(
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"

# Try describing the image with GPT
# Try describing the image with an LLM
llm_describber = kwargs.get("llm_describber")
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
llm_description = self._get_llm_description(
llm_prompt = kwargs.get("llm_prompt")
llm_description = None

if llm_describber is not None:
# New, flexible path using a callback
llm_description = self._get_llm_description_from_callback(
file_stream,
stream_info,
describber=llm_describber,
prompt=llm_prompt,
)
elif llm_client is not None and llm_model is not None:
# Legacy path for backward compatibility with OpenAI client
llm_description = self._get_llm_description_openai(
file_stream,
stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
prompt=llm_prompt,
)

if llm_description is not None:
md_content += "\n# Description:\n" + llm_description.strip() + "\n"
if llm_description:
md_content += "\n# Description:\n" + llm_description.strip() + "\n"

return DocumentConverterResult(
markdown=md_content,
)

def _get_llm_description(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
*,
client,
model,
prompt=None,
) -> Union[None, str]:
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."

def _prepare_data_uri(
self, file_stream: BinaryIO, stream_info: StreamInfo
) -> Union[str, None]:
"""Prepares a data URI from a file stream."""
# Get the content type
content_type = stream_info.mimetype
if not content_type:
Expand All @@ -109,13 +115,52 @@ def _get_llm_description(
cur_pos = file_stream.tell()
try:
base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
except Exception as e:
except Exception:
return None
finally:
file_stream.seek(cur_pos)

# Prepare the data-uri
data_uri = f"data:{content_type};base64,{base64_image}"
return f"data:{content_type};base64,{base64_image}"

def _get_llm_description_from_callback(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
*,
describber: Callable[..., Union[str, None]],
prompt: Union[str, None],
) -> Union[str, None]:
"""Gets image description from a user-provided callback function."""
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."

data_uri = self._prepare_data_uri(file_stream, stream_info)
if not data_uri:
return None

try:
# The callback is responsible for the actual LLM call
return describber(data_uri=data_uri, prompt=prompt)
except Exception:
# Broad exception to safeguard against errors in user-provided code
return None

def _get_llm_description_openai(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
*,
client: Any,
model: str,
prompt: Union[str, None],
) -> Union[str, None]:
"""Gets image description using the OpenAI client (legacy method)."""
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."

data_uri = self._prepare_data_uri(file_stream, stream_info)
if not data_uri:
return None

# Prepare the OpenAI API request
messages = [
Expand Down
76 changes: 62 additions & 14 deletions packages/markitdown/src/markitdown/converters/_llm_caption.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,98 @@
from typing import BinaryIO, Union
from typing import BinaryIO, Union, Callable, Any
import base64
import mimetypes
from .._stream_info import StreamInfo


def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
) -> Union[None, str]:
    """Describe an image with an LLM.

    Dispatches to a user-provided ``llm_describber`` callback when one is
    supplied; otherwise falls back to the legacy OpenAI ``client``/``model``
    path. Returns None when neither is configured.
    """
    llm_describber = kwargs.get("llm_describber")
    llm_client = kwargs.get("client")
    llm_model = kwargs.get("model")
    llm_prompt = kwargs.get("prompt")

    if llm_describber is not None:
        # Preferred path: the callback owns the LLM interaction entirely.
        return _get_llm_description_from_callback(
            file_stream, stream_info, describber=llm_describber, prompt=llm_prompt
        )
    elif llm_client is not None and llm_model is not None:
        # Legacy path: OpenAI-compatible client plus model name.
        return _get_llm_description_openai(
            file_stream, stream_info, client=llm_client, model=llm_model, prompt=llm_prompt
        )
    # No LLM configured; no caption can be produced.
    return None

# Get the content type

def _prepare_data_uri(
    file_stream: BinaryIO, stream_info: StreamInfo
) -> Union[str, None]:
    """Prepare a base64 data URI from a file stream.

    The content type comes from the stream info's mimetype, then a guess
    from the extension, then a generic fallback. The stream position is
    restored after reading. Returns None if the stream cannot be read.
    """
    content_type = stream_info.mimetype
    if not content_type:
        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
    if not content_type:
        content_type = "application/octet-stream"

    # Convert to base64, restoring the caller's stream position afterwards
    cur_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        return None
    finally:
        file_stream.seek(cur_pos)

    return f"data:{content_type};base64,{base64_image}"


def _get_llm_description_from_callback(
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    *,
    describber: Callable[..., Union[str, None]],
    prompt: Union[str, None],
) -> Union[str, None]:
    """Return an image description produced by a user-supplied callback.

    Substitutes a default prompt when none (or a blank one) is supplied.
    Returns None if the image cannot be encoded or the callback raises.
    """
    if prompt is None or not prompt.strip():
        prompt = "Write a detailed caption for this image."

    encoded = _prepare_data_uri(file_stream, stream_info)
    if not encoded:
        return None

    try:
        # The user-provided callback performs the actual LLM request.
        return describber(data_uri=encoded, prompt=prompt)
    except Exception:
        # Never let failures in user code break the conversion pipeline.
        return None


def _get_llm_description_openai(
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    *,
    client: Any,
    model: str,
    prompt: Union[str, None],
) -> Union[str, None]:
    """Get an image description using the OpenAI client (legacy method).

    Falls back to a default prompt when none is given. Returns None if the
    image cannot be encoded as a data URI.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    data_uri = _prepare_data_uri(file_stream, stream_info)
    if not data_uri:
        return None

    # Prepare the OpenAI chat-completions request: a single user message
    # carrying the text prompt followed by the image as a data URI.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": data_uri},
                },
            ],
        }
    ]

    # Call the OpenAI API and return the first choice's text
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
59 changes: 33 additions & 26 deletions packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,34 +99,36 @@ def get_shape_content(shape, **kwargs):
alt_text = ""

# Potentially generate a description using an LLM
llm_describber = kwargs.get("llm_describber")
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
# Prepare a file_stream and stream_info for the image data
image_filename = shape.image.filename
image_extension = None
if image_filename:
image_extension = os.path.splitext(image_filename)[1]
image_stream_info = StreamInfo(
mimetype=shape.image.content_type,
extension=image_extension,
filename=image_filename,
)

image_stream = io.BytesIO(shape.image.blob)

# Caption the image
try:
llm_description = llm_caption(
image_stream,
image_stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
)
except Exception:
# Unable to generate a description
pass
# Prepare a file_stream and stream_info for the image data
image_filename = shape.image.filename
image_extension = None
if image_filename:
image_extension = os.path.splitext(image_filename)[1]
image_stream_info = StreamInfo(
mimetype=shape.image.content_type,
extension=image_extension,
filename=image_filename,
)

image_stream = io.BytesIO(shape.image.blob)

# Caption the image
try:
llm_description = llm_caption(
image_stream,
image_stream_info,
llm_describber=llm_describber,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
)
except Exception:
# Unable to generate a description
pass

# Also grab any description embedded in the deck
try:
Expand All @@ -136,7 +138,12 @@ def get_shape_content(shape, **kwargs):
pass

# Prepare the alt, escaping any special characters
alt_text = "\n".join([llm_description, alt_text]) or shape.name
alt_text_parts = []
if llm_description:
alt_text_parts.append(llm_description)
if alt_text:
alt_text_parts.append(alt_text)
alt_text = "\n".join(alt_text_parts) or shape.name
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()

Expand Down
Loading