From d62ab370364b45669251c1eda2f5af8b667e0df8 Mon Sep 17 00:00:00 2001 From: cyteen Date: Fri, 10 May 2024 08:21:05 +0100 Subject: [PATCH] Refined the Release class and get_dist_releases Now uses the webpage for dist information rather than the large ls-lR.gz that needed unpacking. --- metascripts/rebuild-debian-csv | 106 ++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 15 deletions(-) mode change 100755 => 100644 metascripts/rebuild-debian-csv diff --git a/metascripts/rebuild-debian-csv b/metascripts/rebuild-debian-csv old mode 100755 new mode 100644 index 92d5b89..5a6a854 --- a/metascripts/rebuild-debian-csv +++ b/metascripts/rebuild-debian-csv @@ -19,14 +19,22 @@ class Release: def __init__(self, fileobj): params = {} for line in fileobj: - line = line.decode("utf-8") - if line.startswith(" ") or ": " not in line: - continue + line = line.decode('utf-8').strip() + if line == "MD5Sum:" or line == "SHA256:": + break - k, v = line.strip().split(": ", 1) + k, v = line.split(": ", 1) params[k] = v - self.label = params.get("Label") + # "Origin" is Debian/Ubuntu/Devuan as expected. + # "Origin" = "Label" for Debian and Ubuntu, not always for Devuan. + # "Label" is "Debian"/"Ubuntu" for Debian/Ubuntu. + # "Label" is "Devuan" or "Master" for Devuan. + # "Label" of "Master" has no equivalent in Debian/Ubuntu. + # Where this program uses "label" it really wants "origin". + self.origin = params.get("Origin") + self.label = self.origin + self.suite = params.get("Suite") self.version = params.get("Version") self.codename = params.get("Codename") @@ -84,16 +92,29 @@ class Release: return date.today() - release_date def is_relevant(self): - if self.label not in ("Debian", "Ubuntu", ): + if self.label not in ("Debian", "Ubuntu", "Devuan", ): return False - bl1 = ("oldoldstable", "devel", ) - if self.suite in bl1: - return False + if self.label == "Debian" or self.label == "Ubuntu": + bl1 = ("oldoldstable", "devel", ) + if self.suite in bl1: + return False - bl2 = ("-updates", "-backports", "-security", "-proposed", "-sloppy", ) - if any(self.suite.endswith(suffix) for suffix in bl2): - return False + bl2 = ("-updates", "-backports", "-security", "-proposed", "-sloppy", ) + if any(self.suite.endswith(suffix) for suffix in bl2): + return False + + if self.label == "Devuan": + # "oldoldstable" is maintained in Devuan. + # These are no longer maintained. + bl_ = ("jessie", "ascii", ) + if self.suite in bl_: + return False + + # For fine-grained control: + bl_ = ("-backports", "-security", "-proposed-updates", ) + if any(self.suite.endswith(suffix) for suffix in bl_): + return False if self.label == "Ubuntu": if self.is_lts(): @@ -108,10 +129,19 @@ class Release: return True if self.label == "Ubuntu" and self.age() < timedelta(days=0): return True + if self.label == "Devuan" and self.suite == "experimental": + return True return False +""" +# Note: get_releases(url) is deprecated because it can not work with Devuan. +# Instead, use get_dist_releases(url). +# +# get_releases(url) requires the file ls-lR.gz to be present. +# This is not available in Devuan. +# def get_releases(url): dirlinepattern = re.compile( r"\.(/dists/[\w\-]+):" @@ -172,6 +202,49 @@ def get_releases(url): yield Release(u) except urllib.error.URLError as e: logger.warning("Failed to download %s: %s", relurl, e) +""" + + +def get_dist_releases(url): + + from lxml import html + + # Open the web page listurl and use an xpath to extract the dist names. + listurl = url + "/dists/" + + try: + tree = html.fromstring(urllib.request.urlopen(listurl).read()) + logger.debug("Downloaded %s", listurl) + except urllib.error.URLError as e: + logger.warning("Failed to download %s: %s", listurl, e) + else: + # Extract dist names from the web links. + """ + Finds in the web page. + + Using Xpath 1.0: + matches: buster/, daedalus/, noble/, oldstable, stable/, unstable/ + does not match: ../, /debian/, daedalus-updates/, 6.0/ + + """ + dist_path = "//a[contains(@href,'/') \ + and not(starts-with(@href,'/')) \ + and not(contains(@href,'-')) \ + and not(contains(@href,'.')) \ + ]/@href" + + dist_names = tree.xpath(dist_path) + + for debiandir in dist_names: + relurl = listurl + debiandir + "Release" + + try: + with urllib.request.urlopen(relurl) as u: + logger.debug("Downloaded %s", relurl) + + yield Release(u) + except urllib.error.URLError as e: + logger.warning("Failed to download %s: %s", relurl, e) def write_csv(filename, releases, archs): @@ -181,6 +254,7 @@ def write_csv(filename, releases, archs): for r in releases: if not r.is_relevant(): + logger.debug("Discarding as not relevant: %s ", repr(r)) continue for arch in archs: @@ -203,11 +277,13 @@ def write_csv(filename, releases, archs): if __name__ == "__main__": logger.info("Downloading releases...") - debianreleases = set(get_releases("http://ftp.debian.org/debian")) + debianreleases = set(get_dist_releases("http://ftp.debian.org/debian")) assert len(debianreleases) > 0 - ubuntureleases = set(get_releases("http://ftp.ubuntu.com/ubuntu")) + ubuntureleases = set(get_dist_releases("http://ftp.ubuntu.com/ubuntu")) assert len(ubuntureleases) > 0 - releases = list(sorted(debianreleases | ubuntureleases)) + devuanreleases = set(get_dist_releases("http://deb.devuan.org/merged")) + assert len(devuanreleases) > 0 + releases = list(sorted(debianreleases | ubuntureleases | devuanreleases)) assert len(releases) > 0 logger.info("Found %d releases", len(releases))