|
15 | 15 | """ |
16 | 16 | from charon.utils.files import digest |
17 | 17 | from typing import Tuple, List, Dict |
18 | | -from bs4 import BeautifulSoup |
| 18 | +from html.parser import HTMLParser |
19 | 19 | import tempfile |
20 | 20 | import os |
21 | 21 | import logging |
@@ -232,18 +232,27 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]: |
232 | 232 | return [] |
233 | 233 |
|
234 | 234 |
|
235 | | -def _parseContent(pageContent: str, parent: str) -> List[str]: |
236 | | - items = [] |
237 | | - soup = BeautifulSoup(pageContent, "html.parser") |
238 | | - contents = soup.find("ul", id="contents").find_all("a") |
239 | | - for c in contents: |
240 | | - item = c["href"] |
241 | | - if not item or item.strip() == '../': |
242 | | - continue |
243 | | - items.append(os.path.join(parent, item)) |
244 | | - return items |
class IndexParser(HTMLParser):
    """HTML parser that collects the href targets of <a> tags found inside
    the <ul id="contents"> element of a repository index page.

    This mirrors the previous BeautifulSoup-based behavior
    (soup.find("ul", id="contents").find_all("a")): anchors outside the
    contents list (navigation, headers) are ignored, and the parent
    directory link '../' and empty hrefs are skipped.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(); no explicit reset() needed.
        super().__init__()
        self.__content = []
        # True while we are inside <ul id="contents">; anchors are only
        # collected in that region.
        self.__in_contents = False

    def handle_starttag(self, tag, attrs):
        if tag == "ul" and dict(attrs).get("id") == "contents":
            self.__in_contents = True
        elif tag == "a" and self.__in_contents:
            for name, link in attrs:
                # A bare "href" attribute yields link=None; guard before strip().
                if name == "href" and link and link.strip() not in ("../", ""):
                    self.__content.append(link)

    def handle_endtag(self, tag):
        # Leaving the contents list stops collection.
        if tag == "ul":
            self.__in_contents = False

    def get_content(self):
        """Return the list of collected href values, in document order."""
        return self.__content
245 | 249 |
|
246 | 250 |
|
def _parseContent(pageContent: str, parent: str) -> List[str]:
    """Extract the item links from an HTML index page.

    Feeds the page through IndexParser and returns each collected href
    joined onto the given parent path, in document order.
    """
    index_parser = IndexParser()
    index_parser.feed(pageContent)
    joined_items = []
    for entry in index_parser.get_content():
        joined_items.append(os.path.join(parent, entry))
    return joined_items
247 | 256 | def _read_remote_file_content(remote_file_url: str) -> str: |
248 | 257 | try: |
249 | 258 | with requests.get(remote_file_url) as r: |
|
0 commit comments