|
15 | 15 | """ |
16 | 16 | from charon.utils.files import digest |
17 | 17 | from typing import Tuple, List, Dict |
18 | | -from bs4 import BeautifulSoup |
| 18 | +from html.parser import HTMLParser |
19 | 19 | import tempfile |
20 | 20 | import os |
21 | 21 | import logging |
@@ -232,18 +232,27 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]: |
232 | 232 | return [] |
233 | 233 |
|
234 | 234 |
|
235 | | -def _parseContent(pageContent: str, parent: str) -> List[str]: |
236 | | - items = [] |
237 | | - soup = BeautifulSoup(pageContent, "html.parser") |
238 | | - contents = soup.find("ul", id="contents").find_all("a") |
239 | | - for c in contents: |
240 | | - item = c["href"] |
241 | | - if not item or item.strip() == '../': |
242 | | - continue |
243 | | - items.append(os.path.join(parent, item)) |
244 | | - return items |
class IndexParser(HTMLParser):
    """HTML parser that collects the href targets of <a> tags found inside
    the <ul id="contents"> element of a repository index page.

    This mirrors the previous BeautifulSoup-based behavior
    (soup.find("ul", id="contents").find_all("a")): anchors outside the
    contents list (navigation, headers) are ignored, and the parent
    directory link '../' and empty hrefs are skipped.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(); no explicit reset() needed.
        super().__init__()
        self.__content = []
        # True while we are inside <ul id="contents">; anchors are only
        # collected in that region.
        self.__in_contents = False

    def handle_starttag(self, tag, attrs):
        if tag == "ul" and dict(attrs).get("id") == "contents":
            self.__in_contents = True
        elif tag == "a" and self.__in_contents:
            for name, link in attrs:
                # A bare "href" attribute yields link=None; guard before strip().
                if name == "href" and link and link.strip() not in ("../", ""):
                    self.__content.append(link)

    def handle_endtag(self, tag):
        # Leaving the contents list stops collection.
        if tag == "ul":
            self.__in_contents = False

    def get_content(self):
        """Return the list of collected href values, in document order."""
        return self.__content
245 | 249 |
|
246 | 250 |
|
def _parseContent(pageContent: str, parent: str) -> List[str]:
    """Extract the item links from an HTML index page.

    Feeds the page through IndexParser and returns each collected href
    joined onto the given parent path, in document order.
    """
    index_parser = IndexParser()
    index_parser.feed(pageContent)
    joined_items = []
    for entry in index_parser.get_content():
        joined_items.append(os.path.join(parent, entry))
    return joined_items
247 | 256 | def _read_remote_file_content(remote_file_url: str) -> str: |
248 | 257 | try: |
249 | 258 | with requests.get(remote_file_url) as r: |
|
0 commit comments