automate/utils/html-to-json.py

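"""Scrape a Debian package directory listing and print its rows as JSON.

The page's URL is taken from the command line; each table row is emitted as
an object with "Name", "Last modified", and "Size" keys.
"""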

import requests
import json
from bs4 import BeautifulSoup
import argparse
# Create the parser
parser = argparse.ArgumentParser(description='Scrape Debian package information from a URL.')
# Add the URL argument
parser.add_argument('url', type=str, help='The URL to scrape Debian package information from.')
# Parse the arguments
args = parser.parse_args()
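# Example invocation (URL is illustrative; any page with a matching package
# table works):
#   python html-to-json.py https://deb.debian.org/debian/pool/main/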
# Use the parsed URL
url = args.url
response = requests.get(url, timeout=30)
response.raise_for_status()  # Fail fast on HTTP errors (404, 5xx, ...)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')
data = []
# Find all the table rows
rows = soup.find_all('tr')
for row in rows:
    cells = row.find_all('td')
    # A typical Debian/Apache-style listing row has at least four cells:
    # icon, Name, Last modified, Size (column order assumed from the
    # indexing below). Shorter rows (headers, dividers) are skipped.
    if len(cells) >= 4:
        name = cells[1].text.strip()
        last_modified = cells[2].text.strip()
        size = cells[3].text.strip()
        data.append({
            "Name": name,
            "Last modified": last_modified,
            "Size": size
        })
json_data = json.dumps(data, indent=2)
print(json_data)
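# Illustrative output shape (values depend on the scraped page):
# [
#   {"Name": "example_1.0.deb", "Last modified": "2024-01-01 12:00", "Size": "10K"},
#   ...
# ]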