rebuild-debian-csv: Switch from FTP to HTTP

This commit is contained in:
Johan Gunnarsson 2023-07-27 13:53:25 +02:00
parent 14091f59d6
commit 559316ff5d
1 changed file with 65 additions and 21 deletions

View File

@ -1,11 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from datetime import timedelta, date, datetime from datetime import timedelta, date, datetime
from io import BytesIO
from csv import DictWriter from csv import DictWriter
import ftplib import gzip
import logging import logging
import re
import urllib.request
import urllib.error
logging.basicConfig() logging.basicConfig()
@ -102,28 +104,66 @@ class Release:
return False return False
def get_releases(): def get_releases(url):
ftp = ftplib.FTP("mirror.nl.leaseweb.net") dirlinepattern = re.compile(
ftp.login() r"\.(/dists/[\w\-]+):"
logger.debug("Connected to FTP") )
for distdir in ("/debian/dists", "/ubuntu/dists", ): filelinepattern = re.compile(
ftp.cwd(distdir) r"([\w\-]+)" # permissions (1)
distsubdirs = ftp.nlst() r"\s+"
assert len(distsubdirs) > 0 r"(\d+)" # inodes (2)
logger.debug("Found %d items in %s", len(distsubdirs), distdir) r"\s+"
r"(\w+)" # user (3)
r"\s+"
r"(\w+)" # group (4)
r"\s+"
r"(\d+)" # size (5)
r"\s+"
r"(\w+\s+\d+\s+\d+:\d+|\w+\s+\d+\s+\d+)" # datetime (6)
r"\s+"
r"(.*)" # filename (7)
)
for x in distsubdirs: listurl = url + "/ls-lR.gz"
data = BytesIO() with gzip.open(urllib.request.urlopen(listurl), "rt") as f:
logger.debug("Downloaded %s", listurl)
while f:
try: try:
ftp.retrbinary(f"RETR {x}/Release", data.write) dirnameline = next(f).strip()
assert data.tell() > 0 assert dirnameline.startswith(".")
data.seek(0) totalline = next(f).strip()
logger.debug("Downloaded %s/%s/Release", distdir, x) assert totalline.startswith("total ")
except StopIteration:
break
yield Release(data) skipdir = True
except ftplib.error_perm: dirlinematch = dirlinepattern.fullmatch(dirnameline)
pass if dirlinematch:
debiandir = dirlinematch.group(1)
skipdir = False
for a in f:
fileline = a.strip()
if fileline == "":
break
if skipdir:
continue
filelinematch = filelinepattern.fullmatch(fileline)
if not filelinematch:
continue
filename = filelinematch.group(7)
if filename == "Release" or filename.startswith("Release ->"):
relurl = url + debiandir + "/Release"
try:
with urllib.request.urlopen(relurl) as u:
logger.debug("Downloaded %s", relurl)
yield Release(u)
except urllib.error.URLError as e:
logger.warning("Failed to download %s: %s", relurl, e)
def write_csv(filename, releases, archs): def write_csv(filename, releases, archs):
@ -155,7 +195,11 @@ def write_csv(filename, releases, archs):
if __name__ == "__main__": if __name__ == "__main__":
logger.info("Downloading releases...") logger.info("Downloading releases...")
releases = list(sorted(set(get_releases()))) debianreleases = set(get_releases("http://ftp.debian.org/debian"))
assert len(debianreleases) > 0
ubuntureleases = set(get_releases("http://ftp.ubuntu.com/ubuntu"))
assert len(ubuntureleases) > 0
releases = list(sorted(debianreleases | ubuntureleases))
assert len(releases) > 0 assert len(releases) > 0
logger.info("Found %d releases", len(releases)) logger.info("Found %d releases", len(releases))