Skip to content

Commit c0feb5d

Browse files
authored
Merge pull request #199 from ligangty/checksum-http
Use HTMLParser instead of bs4 in checksum validation
2 parents 3dd7e96 + 41d2b32 commit c0feb5d

File tree

3 files changed

+18
-14
lines changed

3 files changed

+18
-14
lines changed

charon.spec

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ Requires: python%{python3_pkgversion}-importlib-metadata
5050
Requires: python%{python3_pkgversion}-zipp
5151
Requires: python%{python3_pkgversion}-attrs
5252
Requires: python%{python3_pkgversion}-pyrsistent
53-
Requires: python%{python3_pkgversion}-beautifulsoup4
5453

5554
%description
5655
Simple Python tool with command line interface for charon init,

charon/pkgs/checksum_http.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"""
1616
from charon.utils.files import digest
1717
from typing import Tuple, List, Dict
18-
from bs4 import BeautifulSoup
18+
from html.parser import HTMLParser
1919
import tempfile
2020
import os
2121
import logging
@@ -224,24 +224,30 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]:
224224
contentType = r.headers.get('Content-Type')
225225
if contentType and "text/html" in contentType:
226226
pageContent = r.text
227-
return _parseContent(pageContent, folder_path)
227+
p = _IndexParser()
228+
p.feed(pageContent)
229+
return p.get_content(folder_path)
228230
else:
229231
logger.warning("%s is not a folder!", folder_url)
230232
except Exception as e:
231233
logger.error("Can not list folder %s. The error is %s", folder_url, e)
232234
return []
233235

234236

235-
def _parseContent(pageContent: str, parent: str) -> List[str]:
236-
items = []
237-
soup = BeautifulSoup(pageContent, "html.parser")
238-
contents = soup.find("ul", id="contents").find_all("a")
239-
for c in contents:
240-
item = c["href"]
241-
if not item or item.strip() == '../':
242-
continue
243-
items.append(os.path.join(parent, item))
244-
return items
237+
class _IndexParser(HTMLParser):
238+
def __init__(self):
239+
super().__init__()
240+
self.reset()
241+
self.__content = []
242+
243+
def handle_starttag(self, tag, attrs):
244+
if tag == "a":
245+
for name, link in attrs:
246+
if name == "href" and link.strip() not in ['../', '']:
247+
self.__content.append(link)
248+
249+
def get_content(self, parent):
250+
return [os.path.join(parent, i) for i in self.__content]
245251

246252

247253
def _read_remote_file_content(remote_file_url: str) -> str:

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ PyYAML==6.0.1
88
defusedxml==0.7.1
99
subresource-integrity==0.2
1010
jsonschema==4.19.0
11-
beautifulsoup4==4.11.1
1211
urllib3==1.26.18

0 commit comments

Comments
 (0)