|
15 | 15 | """ |
16 | 16 | from charon.utils.files import digest |
17 | 17 | from typing import Tuple, List, Dict |
18 | | -from bs4 import BeautifulSoup |
| 18 | +from html.parser import HTMLParser |
19 | 19 | import tempfile |
20 | 20 | import os |
21 | 21 | import logging |
@@ -224,24 +224,30 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]: |
224 | 224 | contentType = r.headers.get('Content-Type') |
225 | 225 | if contentType and "text/html" in contentType: |
226 | 226 | pageContent = r.text |
227 | | - return _parseContent(pageContent, folder_path) |
| 227 | + p = _IndexParser() |
| 228 | + p.feed(pageContent) |
| 229 | + return p.get_content(folder_path) |
228 | 230 | else: |
229 | 231 | logger.warning("%s is not a folder!", folder_url) |
230 | 232 | except Exception as e: |
231 | 233 | logger.error("Can not list folder %s. The error is %s", folder_url, e) |
232 | 234 | return [] |
233 | 235 |
|
234 | 236 |
|
235 | | -def _parseContent(pageContent: str, parent: str) -> List[str]: |
236 | | - items = [] |
237 | | - soup = BeautifulSoup(pageContent, "html.parser") |
238 | | - contents = soup.find("ul", id="contents").find_all("a") |
239 | | - for c in contents: |
240 | | - item = c["href"] |
241 | | - if not item or item.strip() == '../': |
242 | | - continue |
243 | | - items.append(os.path.join(parent, item)) |
244 | | - return items |
class _IndexParser(HTMLParser):
    """Collect the ``href`` targets of ``<a>`` tags from an HTML page.

    Usage: create an instance, pass the page text to ``feed()``, then call
    ``get_content()`` to obtain the collected links joined onto a parent
    path. Links that are empty (after stripping whitespace) or that equal
    ``../`` are ignored.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so an explicit second
        # reset() call is unnecessary.
        super().__init__()
        # href values in document order, stored exactly as they appear in
        # the page (not stripped).
        self._links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` value of every ``<a>`` start tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is ``None`` for an attribute written without
        a value (e.g. ``<a href>``).
        """
        if tag != "a":
            return
        for name, link in attrs:
            # Skip valueless hrefs: calling strip() on None would raise and
            # abort parsing of the whole page.
            if name != "href" or link is None:
                continue
            if link.strip() not in ("../", ""):
                self._links.append(link)

    def get_content(self, parent: str) -> List[str]:
        """Return every collected link joined onto ``parent``.

        Joining uses ``os.path.join``, so a link that is itself an absolute
        path replaces ``parent`` rather than being appended to it.
        """
        return [os.path.join(parent, link) for link in self._links]
245 | 251 |
|
246 | 252 |
|
247 | 253 | def _read_remote_file_content(remote_file_url: str) -> str: |
|
0 commit comments