Merge pull request #199 from ligangty/checksum-http

ligangty · web-flow · commit c0feb5db5bf2 · 2024-03-20T15:42:35.000+08:00
Use HTMLParser instead of bs4 in checksum validation
diff --git a/charon.spec b/charon.spec
@@ -50,7 +50,6 @@ Requires: python%{python3_pkgversion}-importlib-metadata
 Requires: python%{python3_pkgversion}-zipp
 Requires: python%{python3_pkgversion}-attrs
 Requires: python%{python3_pkgversion}-pyrsistent
-Requires: python%{python3_pkgversion}-beautifulsoup4
 
 %description
 Simple Python tool with command line interface for charon init,
diff --git a/charon/pkgs/checksum_http.py b/charon/pkgs/checksum_http.py
@@ -15,7 +15,7 @@
 """
 from charon.utils.files import digest
 from typing import Tuple, List, Dict
-from bs4 import BeautifulSoup
+from html.parser import HTMLParser
 import tempfile
 import os
 import logging
@@ -224,24 +224,30 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]:
                 contentType = r.headers.get('Content-Type')
                 if contentType and "text/html" in contentType:
                     pageContent = r.text
-                    return _parseContent(pageContent, folder_path)
+                    p = _IndexParser()
+                    p.feed(pageContent)
+                    return p.get_content(folder_path)
                 else:
                     logger.warning("%s is not a folder!", folder_url)
     except Exception as e:
         logger.error("Can not list folder %s. The error is %s", folder_url, e)
     return []
 
 
-def _parseContent(pageContent: str, parent: str) -> List[str]:
-    items = []
-    soup = BeautifulSoup(pageContent, "html.parser")
-    contents = soup.find("ul", id="contents").find_all("a")
-    for c in contents:
-        item = c["href"]
-        if not item or item.strip() == '../':
-            continue
-        items.append(os.path.join(parent, item))
-    return items
+class _IndexParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.reset()
+        self.__content = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for name, link in attrs:
+                if name == "href" and link.strip() not in ['../', '']:
+                    self.__content.append(link)
+
+    def get_content(self, parent):
+        return [os.path.join(parent, i) for i in self.__content]
 
 
 def _read_remote_file_content(remote_file_url: str) -> str:
diff --git a/requirements.txt b/requirements.txt
@@ -8,5 +8,4 @@ PyYAML==6.0.1
 defusedxml==0.7.1
 subresource-integrity==0.2
 jsonschema==4.19.0
-beautifulsoup4==4.11.1
 urllib3==1.26.18