#!/usr/bin/env python3
"""Rebuild the CSV lists of Debian, Ubuntu and Devuan releases.

The script downloads the "Release" file of every distribution directory
found below "<mirror>/dists/", keeps the releases that are still relevant
and writes one CSV file per architecture family (ARM and x86).
"""

from csv import DictWriter
from datetime import date, datetime, timedelta
import gzip
import logging
import re
import urllib.error
import urllib.request

logging.basicConfig()
logger = logging.getLogger("rebuild-debian-csv")
logger.setLevel(logging.DEBUG)


class Release:
    """Header fields of one APT "Release" file.

    Attributes set by the constructor (each is None when the field is
    absent from the file, except ``architectures`` which is then empty):
    ``origin``, ``label``, ``suite``, ``version``, ``codename``,
    ``architectures`` (list of str) and ``sortkey`` (str).
    """

    # Pseudo versions used to order suites that carry no "Version" field.
    SUITE_TO_VERSION = {
        "testing": "96",
        "unstable": "97",
        "experimental": "98",
    }

    def __init__(self, fileobj):
        """Parse the header of a "Release" file.

        Args:
            fileobj: iterable yielding the lines of the file as UTF-8
                encoded bytes (e.g. the response of urlopen()).
        """
        params = {}
        for raw_line in fileobj:
            line = raw_line.decode("utf-8").strip()
            # Header of "Release" finishes at:
            # "MD5Sum:" in Debian/Ubuntu
            # "SHA256:" in Devuan
            if line in ("MD5Sum:", "SHA256:"):
                break
            key, sep, value = line.partition(": ")
            if not sep:
                # Blank line or field without a value: nothing to record.
                # (An unconditional split() used to raise ValueError here.)
                continue
            params[key] = value

        # In Release files,
        # e.g. https://ftp.debian.org/debian/dists/stable/Release
        # "Origin" is Debian/Ubuntu/Devuan as expected.
        # "Origin" = "Label" for Debian and Ubuntu, not always for Devuan.
        # "Label" is "Debian"/"Ubuntu" for Debian/Ubuntu.
        # "Label" is "Devuan" or "Master" for Devuan.
        # "Label" of "Master" has no equivalent in Debian/Ubuntu.
        #
        # Where this program uses "label" it really wants "origin".
        self.origin = params.get("Origin")
        self.label = self.origin
        self.suite = params.get("Suite")
        self.version = params.get("Version")
        self.codename = params.get("Codename")
        self.architectures = params.get("Architectures", "").split()

        # Plain string key: releases are ordered lexically by label, then
        # by version (or by the pseudo version of their suite).
        self.sortkey = (self.label or "") + (
            self.version or self.SUITE_TO_VERSION.get(self.suite, "99")
        )

    def __repr__(self):
        """Return the human readable name, e.g. 'Debian stable/12 ("bookworm")'."""
        name = self.label or "Unknown"
        if self.version and self.suite and self.suite != self.codename:
            name += f" {self.suite}/{self.version}"
        elif self.version:
            name += f" {self.version}"
        elif self.suite:
            name += f" {self.suite}"
        if self.is_lts():
            name += " LTS"
        if self.codename:
            name += f" (\"{self.codename}\")"
        return name

    def __eq__(self, other):
        """Two releases are equal when their displayed names are equal."""
        if not isinstance(other, Release):
            return NotImplemented
        return repr(self) == repr(other)

    def __lt__(self, other):
        """Order releases by their sortkey."""
        return self.sortkey < other.sortkey

    def __hash__(self):
        """Hash consistently with __eq__ (both are based on repr)."""
        return hash(repr(self))

    def release_date(self):
        """Return the release date derived from an Ubuntu "YY.MM" version.

        Returns:
            A date (first day of the release month), or None when the
            release is not Ubuntu, has no version, or the version cannot
            be parsed (a warning is logged in the last case).
        """
        if self.label == "Ubuntu" and self.version:
            try:
                return datetime.strptime(self.version, "%y.%m").date()
            except ValueError as e:
                logger.warning("Can't parse calver %s: %s", self.version, e)
        return None

    def is_lts(self):
        """Return True for releases dated April of an even year."""
        release_date = self.release_date()
        if release_date is None:
            return False
        return release_date.year % 2 == 0 and release_date.month == 4

    def age(self):
        """Return today minus the release date, or None if it is unknown."""
        release_date = self.release_date()
        if release_date is None:
            return None
        return date.today() - release_date

    def is_relevant(self):
        """Return True when the release should be listed in the CSV files."""
        if self.label not in ("Debian", "Ubuntu", "Devuan"):
            return False

        # A missing "Suite" field must not crash the suffix checks below.
        suite = self.suite or ""

        if self.label in ("Debian", "Ubuntu"):
            if suite in ("oldoldstable", "devel"):
                return False
            suffixes = (
                "-updates",
                "-backports",
                "-security",
                "-proposed",
                "-sloppy",
            )
            if suite.endswith(suffixes):
                return False

        if self.label == "Devuan":
            # "oldoldstable" is maintained in Devuan.
            # These are no longer maintained.
            if suite in ("jessie", "ascii"):
                return False
            # For fine-grained control:
            suffixes = ("-backports", "-security", "-proposed-updates")
            if suite.endswith(suffixes):
                return False

        if self.label == "Ubuntu":
            age = self.age()
            if age is None:
                # Without a release date the age limit cannot be checked;
                # leave the release out instead of failing on None.
                return False
            if self.is_lts():
                return age < 5 * timedelta(days=365)
            return age < timedelta(days=365)

        return True

    def is_experimental(self):
        """Return True for experimental suites and not yet released Ubuntu."""
        if self.label in ("Debian", "Devuan") and self.suite == "experimental":
            return True
        if self.label == "Ubuntu":
            age = self.age()
            # A negative age means the release date is in the future.
            return age is not None and age < timedelta(days=0)
        return False


# Note: get_releases(url) is deprecated because it can not work with Devuan.
# Instead, use get_dist_releases(url).
#
# get_releases(url) requires the file ls-lR.gz to be present.
# This is not available in Devuan.
#
# The disabled code is kept below as comments for reference. It used to
# live in a plain triple-quoted string, where the regex backslashes are
# invalid string escapes.
#
# def get_releases(url):
#     dirlinepattern = re.compile(
#         r"\.(/dists/[\w\-]+):"
#     )
#     filelinepattern = re.compile(
#         r"([\w\-]+)"  # permissions (1)
#         r"\s+"
#         r"(\d+)"  # inodes (2)
#         r"\s+"
#         r"(\w+)"  # user (3)
#         r"\s+"
#         r"(\w+)"  # group (4)
#         r"\s+"
#         r"(\d+)"  # size (5)
#         r"\s+"
#         r"(\w+\s+\d+\s+\d+:\d+|\w+\s+\d+\s+\d+)"  # datetime (6)
#         r"\s+"
#         r"(.*)"  # filename (7)
#     )
#     listurl = url + "/ls-lR.gz"
#     with gzip.open(urllib.request.urlopen(listurl), "rt") as f:
#         logger.debug("Downloaded %s", listurl)
#         while f:
#             try:
#                 dirnameline = next(f).strip()
#                 assert dirnameline.startswith(".")
#                 totalline = next(f).strip()
#                 assert totalline.startswith("total ")
#             except StopIteration:
#                 break
#             skipdir = True
#             dirlinematch = dirlinepattern.fullmatch(dirnameline)
#             if dirlinematch:
#                 debiandir = dirlinematch.group(1)
#                 skipdir = False
#             for a in f:
#                 fileline = a.strip()
#                 if fileline == "":
#                     break
#                 if skipdir:
#                     continue
#                 filelinematch = filelinepattern.fullmatch(fileline)
#                 if not filelinematch:
#                     continue
#                 filename = filelinematch.group(7)
#                 if filename == "Release" or filename.startswith("Release ->"):
#                     relurl = url + debiandir + "/Release"
#                     try:
#                         with urllib.request.urlopen(relurl) as u:
#                             logger.debug("Downloaded %s", relurl)
#                             yield Release(u)
#                     except urllib.error.URLError as e:
#                         logger.warning("Failed to download %s: %s", relurl, e)


def get_dist_releases(url):
    """Yield a Release for every distribution directory below url/dists/.

    The index page "<url>/dists/" is downloaded and its links are filtered
    with an XPath expression; the "Release" file of each remaining
    directory is then downloaded and parsed. Download failures are logged
    as warnings and skipped.

    Args:
        url: base URL of the mirror, without a trailing slash.

    Yields:
        Release objects, one per successfully downloaded "Release" file.
    """
    from lxml import html

    # Open the web page listurl and use an xpath to extract the dist names.
    listurl = url + "/dists/"
    try:
        # The "with" block closes the HTTP response once the page is read.
        with urllib.request.urlopen(listurl) as response:
            page = response.read()
    except urllib.error.URLError as e:
        logger.warning("Failed to download %s: %s", listurl, e)
        return
    logger.debug("Downloaded %s", listurl)
    tree = html.fromstring(page)

    # Extract dist names from the web links, using XPath 1.0.
    #   matches: buster/, daedalus/, noble/, oldstable/, stable/, unstable/
    #   does not match: ../, /debian/, daedalus-updates/, 6.0/
    # The expression selects only hrefs which are:
    #   directories          contains(@href,'/')
    #   not, e.g., /debian/  not(starts-with(@href,'/'))
    #   codenames            not(contains(@href,'-'))
    #   not numbers or ../   not(contains(@href,'.'))
    # This excludes all "-updates", "-backports", "-security",
    # "-proposed", etc.
    dist_path = (
        "//a[contains(@href,'/')"
        " and not(starts-with(@href,'/'))"
        " and not(contains(@href,'-'))"
        " and not(contains(@href,'.'))"
        "]/@href"
    )
    for debiandir in tree.xpath(dist_path):
        relurl = listurl + debiandir + "Release"
        try:
            with urllib.request.urlopen(relurl) as u:
                logger.debug("Downloaded %s", relurl)
                yield Release(u)
        except urllib.error.URLError as e:
            logger.warning("Failed to download %s: %s", relurl, e)


def write_csv(filename, releases, archs):
    """Write the relevant releases to a CSV file, one row per architecture.

    Args:
        filename: path of the CSV file to create (overwritten if present).
        releases: iterable of Release objects, in output order.
        archs: architectures to list; a row is written for each of them
            that the release declares in its "Architectures" field.
    """
    fieldnames = ("OS", "Dist", "Arch", "Name", "Exp")
    with open(filename, "w", newline="", encoding="utf-8") as f:
        w = DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in releases:
            if not r.is_relevant():
                logger.debug("Discarding as not relevant: %s ", repr(r))
                continue
            if not r.codename:
                # The "Dist" column is derived from the codename.
                logger.warning("Discarding as it has no codename: %s", r)
                continue
            dist = r.codename.lower()
            if dist == "rc-buggy":
                dist = "experimental"
            for arch in archs:
                if arch not in r.architectures:
                    continue
                w.writerow({
                    "OS": r.label.lower(),
                    "Dist": dist,
                    "Arch": arch,
                    "Name": repr(r),
                    "Exp": r.is_experimental(),
                })
                logger.debug("Wrote %s to file %s", r, filename)


# Mirrors whose "dists/" directory is scanned for releases.
MIRRORS = (
    "http://ftp.debian.org/debian",
    "http://ftp.ubuntu.com/ubuntu",
    "http://deb.devuan.org/merged",
)


def main():
    """Download the release lists of all mirrors and write both CSV files."""
    logger.info("Downloading releases...")
    found = set()
    for mirror in MIRRORS:
        mirror_releases = set(get_dist_releases(mirror))
        # Explicit check instead of "assert", which is removed by "-O".
        if not mirror_releases:
            raise SystemExit(f"No releases found at {mirror}")
        found |= mirror_releases

    releases = sorted(found)
    logger.info("Found %d releases", len(releases))

    write_csv("debians-arm.csv", releases, ("armhf", "arm64"))
    logger.info("Wrote debians-arm.csv")
    write_csv("debians-x86.csv", releases, ("i386", "amd64"))
    logger.info("Wrote debians-x86.csv")


if __name__ == "__main__":
    main()