Skip to content

Commit 8e75a43

Browse files
committed
Use HTMLParser instead of bs4 in checksum validation
1 parent 3dd7e96 commit 8e75a43

File tree

3 files changed

+20
-13
lines changed

3 files changed

+20
-13
lines changed

charon.spec

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ Requires: python%{python3_pkgversion}-importlib-metadata
5050
Requires: python%{python3_pkgversion}-zipp
5151
Requires: python%{python3_pkgversion}-attrs
5252
Requires: python%{python3_pkgversion}-pyrsistent
53-
Requires: python%{python3_pkgversion}-beautifulsoup4
5453

5554
%description
5655
Simple Python tool with command line interface for charon init,

charon/pkgs/checksum_http.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"""
1616
from charon.utils.files import digest
1717
from typing import Tuple, List, Dict
18-
from bs4 import BeautifulSoup
18+
from html.parser import HTMLParser
1919
import tempfile
2020
import os
2121
import logging
@@ -232,18 +232,27 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]:
232232
return []
233233

234234

235-
def _parseContent(pageContent: str, parent: str) -> List[str]:
236-
items = []
237-
soup = BeautifulSoup(pageContent, "html.parser")
238-
contents = soup.find("ul", id="contents").find_all("a")
239-
for c in contents:
240-
item = c["href"]
241-
if not item or item.strip() == '../':
242-
continue
243-
items.append(os.path.join(parent, item))
244-
return items
235+
class IndexParser(HTMLParser):
    """Collect href targets of <a> tags from an HTML directory-index page.

    Used during checksum validation to list remote folder contents.
    Links back to the parent directory ('../') and empty hrefs are
    skipped; everything else is recorded in document order.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit
        # reset() is needed here; just initialise the result list.
        super().__init__()
        self.__content = []

    def handle_starttag(self, tag, attrs):
        # Record every usable <a href="..."> target. A bare attribute
        # (e.g. '<a href>') yields ("href", None), so guard against a
        # None value before calling .strip() — the unguarded form would
        # raise AttributeError on such markup.
        if tag == "a":
            for name, link in attrs:
                if name == "href" and link and link.strip() not in ('../', ''):
                    self.__content.append(link)

    def get_content(self):
        """Return the list of collected href values, in document order."""
        return self.__content
245249

246250

251+
def _parseContent(pageContent: str, parent: str) -> List[str]:
    """Parse an HTML index page and return its entries joined under parent.

    Feeds the page through IndexParser and prefixes each collected
    href with the given parent path.
    """
    index_parser = IndexParser()
    index_parser.feed(pageContent)
    results = []
    for entry in index_parser.get_content():
        results.append(os.path.join(parent, entry))
    return results
255+
247256
def _read_remote_file_content(remote_file_url: str) -> str:
248257
try:
249258
with requests.get(remote_file_url) as r:

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ PyYAML==6.0.1
88
defusedxml==0.7.1
99
subresource-integrity==0.2
1010
jsonschema==4.19.0
11-
beautifulsoup4==4.11.1
1211
urllib3==1.26.18

0 commit comments

Comments
 (0)