Refactor JSON file retrieval: replace BeautifulSoup link parsing with a regex that matches the resource file links directly, and streamline file name extraction.

2025-07-29 18:24:01 +02:00
parent 7a614fd824
commit 54d13032ab
2 changed files with 59 additions and 9 deletions
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@
 import sys
 import galdPl
 import requests
-from bs4 import BeautifulSoup
+import re
 import os

 def get_json_files_from_folder(folder):
@@ -15,15 +15,16 @@ def get_json_files_from_folder(folder):
    url = base_url + folder
    r = requests.get(url, timeout=10)
    r.raise_for_status()
-    soup = BeautifulSoup(r.text, "html.parser")
+    
+    # Hledáme JSON soubory pomocí regex
+    json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
+    matches = re.findall(json_pattern, r.text)
+    
    files = []
-    # Hledáme odkazy s .json v href
-    for a in soup.find_all("a", href=lambda x: x and x.endswith('.json')):
-        href = a.get("href", "")
-        if href.startswith("/gald/galdistream/src/branch/main/resources/"):
-            # Extrahujeme pouze název souboru
-            file_name = href.split("/")[-1]
-            files.append(file_name)
+    for match in matches:
+        # Extrahujeme pouze název souboru
+        file_name = match.split("/")[-1]
+        files.append(file_name)
    return files

 def update_json_db():