-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Open
Description
We tried to process a document from the MS MARCO dataset that contains a single word about 190k characters long.
The conversion failed with the error below. The document is attached.
✗ Error processing C:\msmarco-v2 Top 0 - 5k files\'Longest' word has 189,819 letters, takes three hours to pronounce.docx: Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "C:\venv\Scripts\markitdown.exe\__main__.py", line 6, in <module>
sys.exit(main())
~~~~^^
File "C:\venv\Lib\site-packages\markitdown\__main__.py", line 91, in main
result = markitdown.convert(args.filename)
File "C:\venv\Lib\site-packages\markitdown\_markitdown.py", line 1563, in convert
return self.convert_local(source, **kwargs)
~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
File "C:\venv\Lib\site-packages\markitdown\_markitdown.py", line 1587, in convert_local
return self._convert(path, extensions, **kwargs)
~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\venv\Lib\site-packages\markitdown\_markitdown.py", line 1731, in _convert
raise FileConversionException(
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
)
markitdown._markitdown.FileConversionException: Could not convert 'C:\msmarco-v2 Top 0 - 5k files\'Longest' word has 189,819 letters, takes three hours to pronounce.docx' to Markdown. File type was recognized as ['.docx', '.wb3', '.doc', '.spo', '.opt', '.rvt', '.vsd', '.msi', '.pub', '.mtw', '.ac_', '.dot', '.pps', '.ppt', '.xla', '.xls', '.wiz', '.sou', '.wps', '.apr', '.msc', '.adp', '.db', '.wdb', '.xlr']. While converting the file, the following error was encountered:
Traceback (most recent call last):
File "C:\venv\Lib\site-packages\markitdown\_markitdown.py", line 1715, in _convert
res = converter.convert(local_path, **_kwargs)
File "C:\venv\Lib\site-packages\markitdown\_markitdown.py", line 779, in convert
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
File "C:\venv\Lib\site-packages\pandas\io\excel\_base.py", line 495, in read_excel
io = ExcelFile(
io,
...<2 lines>...
engine_kwargs=engine_kwargs,
)
File "C:\venv\Lib\site-packages\pandas\io\excel\_base.py", line 1567, in __init__
self._reader = self._engines[engine](
~~~~~~~~~~~~~~~~~~~~~^
self._io,
^^^^^^^^^
storage_options=storage_options,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
engine_kwargs=engine_kwargs,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "C:\venv\Lib\site-packages\pandas\io\excel\_xlrd.py", line 46, in __init__
super().__init__(
~~~~~~~~~~~~~~~~^
filepath_or_buffer,
^^^^^^^^^^^^^^^^^^^
storage_options=storage_options,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
engine_kwargs=engine_kwargs,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "C:\venv\Lib\site-packages\pandas\io\excel\_base.py", line 573, in __init__
self.book = self.load_workbook(self.handles.handle, engine_kwargs)
~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\venv\Lib\site-packages\pandas\io\excel\_xlrd.py", line 63, in load_workbook
return open_workbook(file_contents=data, **engine_kwargs)
File "C:\venv\Lib\site-packages\xlrd\__init__.py", line 172, in open_workbook
bk = open_workbook_xls(
filename=filename,
...<8 lines>...
ignore_workbook_corruption=ignore_workbook_corruption,
)
File "C:\venv\Lib\site-packages\xlrd\book.py", line 68, in open_workbook_xls
bk.biff2_8_load(
~~~~~~~~~~~~~~~^
filename=filename, file_contents=file_contents,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...<5 lines>...
ignore_workbook_corruption=ignore_workbook_corruption
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "C:\venv\Lib\site-packages\xlrd\book.py", line 645, in biff2_8_load
raise XLRDError("Can't find workbook in OLE2 compound document")
xlrd.biffh.XLRDError: Can't find workbook in OLE2 compound document
Attachment: 'Longest' word has 189,819 letters, takes three hours to pronounce.docx
Metadata
Metadata
Assignees
Labels
No labels