Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,32 @@ result = md.convert("example.jpg")
print(result.text_content)
```

Alternatively, you can provide a custom `llm_describber` callback function to generate image descriptions with any LLM. This offers greater flexibility, allowing you to use any multimodal LLM library.

```python
from markitdown import MarkItDown

# Assume a hypothetical gemini_client is available
# from gemini.client import GeminiClient
# gemini_client = GeminiClient()

def gemini_image_describber(data_uri: str, prompt: str) -> str:
    """Describe an image via a hypothetical Gemini client.

    Receives the image as a base64 data URI together with the captioning
    prompt, and returns the description text.
    """
    # response = gemini_client.multimodal.generate_content(
    #     prompt=prompt,
    #     image_data=data_uri
    # )
    # return response.text
    preview = data_uri[:50]
    return f"Description for prompt '{prompt}' and image data starting with '{preview}...'"


md = MarkItDown(llm_describber=gemini_image_describber)
result = md.convert("my_image.jpg")
print(result.markdown)
```

### Docker

```sh
Expand Down
7 changes: 6 additions & 1 deletion packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import io
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any, List, Dict, Optional, Union, BinaryIO
from typing import Any, Callable, List, Dict, Optional, Union, BinaryIO
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn
Expand Down Expand Up @@ -116,6 +116,7 @@ def __init__(
self._llm_client: Any = None
self._llm_model: Union[str | None] = None
self._llm_prompt: Union[str | None] = None
self._llm_describber: Union[Callable[..., Union[str, None]], None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None

Expand All @@ -141,6 +142,7 @@ def enable_builtins(self, **kwargs) -> None:
self._llm_client = kwargs.get("llm_client")
self._llm_model = kwargs.get("llm_model")
self._llm_prompt = kwargs.get("llm_prompt")
self._llm_describber = kwargs.get("llm_describber")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")

Expand Down Expand Up @@ -564,6 +566,9 @@ def _convert(
if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
_kwargs["llm_prompt"] = self._llm_prompt

if "llm_describber" not in _kwargs and self._llm_describber is not None:
_kwargs["llm_describber"] = self._llm_describber

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

Expand Down
91 changes: 68 additions & 23 deletions packages/markitdown/src/markitdown/converters/_image_converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import BinaryIO, Any, Union
from typing import BinaryIO, Any, Union, Callable
import base64
import mimetypes
from ._exiftool import exiftool_metadata
Expand All @@ -15,7 +15,8 @@

class ImageConverter(DocumentConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM.
The LLM interaction can be customized by passing a callback function.
"""

def accepts(
Expand Down Expand Up @@ -65,37 +66,42 @@ def convert(
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"

# Try describing the image with GPT
# Try describing the image with an LLM
llm_describber = kwargs.get("llm_describber")
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
llm_description = self._get_llm_description(
llm_prompt = kwargs.get("llm_prompt")
llm_description = None

if llm_describber is not None:
# New, flexible path using a callback
llm_description = self._get_llm_description_from_callback(
file_stream,
stream_info,
describber=llm_describber,
prompt=llm_prompt,
)
elif llm_client is not None and llm_model is not None:
# Legacy path for backward compatibility with OpenAI client
llm_description = self._get_llm_description_openai(
file_stream,
stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
prompt=llm_prompt,
)

if llm_description is not None:
md_content += "\n# Description:\n" + llm_description.strip() + "\n"
if llm_description:
md_content += "\n# Description:\n" + llm_description.strip() + "\n"

return DocumentConverterResult(
markdown=md_content,
)

def _get_llm_description(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
*,
client,
model,
prompt=None,
) -> Union[None, str]:
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."

def _prepare_data_uri(
self, file_stream: BinaryIO, stream_info: StreamInfo
) -> Union[str, None]:
"""Prepares a data URI from a file stream."""
# Get the content type
content_type = stream_info.mimetype
if not content_type:
Expand All @@ -109,13 +115,52 @@ def _get_llm_description(
cur_pos = file_stream.tell()
try:
base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
except Exception as e:
except Exception:
return None
finally:
file_stream.seek(cur_pos)

# Prepare the data-uri
data_uri = f"data:{content_type};base64,{base64_image}"
return f"data:{content_type};base64,{base64_image}"

def _get_llm_description_from_callback(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
*,
describber: Callable[..., Union[str, None]],
prompt: Union[str, None],
) -> Union[str, None]:
"""Gets image description from a user-provided callback function."""
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."

data_uri = self._prepare_data_uri(file_stream, stream_info)
if not data_uri:
return None

try:
# The callback is responsible for the actual LLM call
return describber(data_uri=data_uri, prompt=prompt)
except Exception:
# Broad exception to safeguard against errors in user-provided code
return None

def _get_llm_description_openai(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
*,
client: Any,
model: str,
prompt: Union[str, None],
) -> Union[str, None]:
"""Gets image description using the OpenAI client (legacy method)."""
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."

data_uri = self._prepare_data_uri(file_stream, stream_info)
if not data_uri:
return None

# Prepare the OpenAI API request
messages = [
Expand Down
76 changes: 62 additions & 14 deletions packages/markitdown/src/markitdown/converters/_llm_caption.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,98 @@
from typing import BinaryIO, Union
from typing import BinaryIO, Union, Callable, Any
import base64
import mimetypes
from .._stream_info import StreamInfo


def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
) -> Union[None, str]:
    """Describe an image with an LLM.

    Dispatches to a user-provided ``llm_describber`` callback when one is
    supplied; otherwise falls back to the legacy OpenAI ``client``/``model``
    path. Returns None when neither is configured.
    """
    llm_describber = kwargs.get("llm_describber")
    llm_client = kwargs.get("client")
    llm_model = kwargs.get("model")
    llm_prompt = kwargs.get("prompt")

    if llm_describber is not None:
        # Preferred path: the callback owns the LLM interaction entirely.
        return _get_llm_description_from_callback(
            file_stream, stream_info, describber=llm_describber, prompt=llm_prompt
        )
    elif llm_client is not None and llm_model is not None:
        # Legacy path: OpenAI-compatible client plus model name.
        return _get_llm_description_openai(
            file_stream, stream_info, client=llm_client, model=llm_model, prompt=llm_prompt
        )
    # No LLM configured; no caption can be produced.
    return None

# Get the content type

def _prepare_data_uri(
    file_stream: BinaryIO, stream_info: StreamInfo
) -> Union[str, None]:
    """Prepare a base64 data URI from a file stream.

    The content type comes from the stream info's mimetype, then a guess
    from the extension, then a generic fallback. The stream position is
    restored after reading. Returns None if the stream cannot be read.
    """
    content_type = stream_info.mimetype
    if not content_type:
        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
    if not content_type:
        content_type = "application/octet-stream"

    # Convert to base64, restoring the caller's stream position afterwards
    cur_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        return None
    finally:
        file_stream.seek(cur_pos)

    return f"data:{content_type};base64,{base64_image}"


def _get_llm_description_from_callback(
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    *,
    describber: Callable[..., Union[str, None]],
    prompt: Union[str, None],
) -> Union[str, None]:
    """Return an image description produced by a user-supplied callback.

    Substitutes a default prompt when none (or a blank one) is supplied.
    Returns None if the image cannot be encoded or the callback raises.
    """
    if prompt is None or not prompt.strip():
        prompt = "Write a detailed caption for this image."

    encoded = _prepare_data_uri(file_stream, stream_info)
    if not encoded:
        return None

    try:
        # The user-provided callback performs the actual LLM request.
        return describber(data_uri=encoded, prompt=prompt)
    except Exception:
        # Never let failures in user code break the conversion pipeline.
        return None


def _get_llm_description_openai(
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    *,
    client: Any,
    model: str,
    prompt: Union[str, None],
) -> Union[str, None]:
    """Get an image description using the OpenAI client (legacy method).

    Falls back to a default prompt when none is given. Returns None if the
    image cannot be encoded as a data URI.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    data_uri = _prepare_data_uri(file_stream, stream_info)
    if not data_uri:
        return None

    # Prepare the OpenAI chat-completions request: a single user message
    # carrying the text prompt followed by the image as a data URI.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": data_uri},
                },
            ],
        }
    ]

    # Call the OpenAI API and return the first choice's text
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
59 changes: 33 additions & 26 deletions packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,34 +99,36 @@ def get_shape_content(shape, **kwargs):
alt_text = ""

# Potentially generate a description using an LLM
llm_describber = kwargs.get("llm_describber")
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
# Prepare a file_stream and stream_info for the image data
image_filename = shape.image.filename
image_extension = None
if image_filename:
image_extension = os.path.splitext(image_filename)[1]
image_stream_info = StreamInfo(
mimetype=shape.image.content_type,
extension=image_extension,
filename=image_filename,
)

image_stream = io.BytesIO(shape.image.blob)

# Caption the image
try:
llm_description = llm_caption(
image_stream,
image_stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
)
except Exception:
# Unable to generate a description
pass
# Prepare a file_stream and stream_info for the image data
image_filename = shape.image.filename
image_extension = None
if image_filename:
image_extension = os.path.splitext(image_filename)[1]
image_stream_info = StreamInfo(
mimetype=shape.image.content_type,
extension=image_extension,
filename=image_filename,
)

image_stream = io.BytesIO(shape.image.blob)

# Caption the image
try:
llm_description = llm_caption(
image_stream,
image_stream_info,
llm_describber=llm_describber,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
)
except Exception:
# Unable to generate a description
pass

# Also grab any description embedded in the deck
try:
Expand All @@ -136,7 +138,12 @@ def get_shape_content(shape, **kwargs):
pass

# Prepare the alt, escaping any special characters
alt_text = "\n".join([llm_description, alt_text]) or shape.name
alt_text_parts = []
if llm_description:
alt_text_parts.append(llm_description)
if alt_text:
alt_text_parts.append(alt_text)
alt_text = "\n".join(alt_text_parts) or shape.name
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()

Expand Down
Loading