Refined the Release class and get_dist_releases

Now uses the /dists/ web page for dist information rather than the large
ls-lR.gz file, which needed unpacking.
This commit is contained in:
cyteen 2024-05-10 08:21:05 +01:00
parent 31b31c2bd7
commit d62ab37036
1 changed files with 91 additions and 15 deletions

106
metascripts/rebuild-debian-csv Executable file → Normal file
View File

@ -19,14 +19,22 @@ class Release:
def __init__(self, fileobj):
params = {}
for line in fileobj:
line = line.decode("utf-8")
if line.startswith(" ") or ": " not in line:
continue
line = line.decode('utf-8').strip()
if line == "MD5Sum:" or line == "SHA256:":
break
k, v = line.strip().split(": ", 1)
k, v = line.split(": ", 1)
params[k] = v
self.label = params.get("Label")
# "Origin" is Debian/Ubuntu/Devuan as expected.
# "Origin" = "Label" for Debian and Ubuntu, not always for Devuan.
# "Label" is "Debian"/"Ubuntu" for Debian/Ubuntu.
# "Label" is "Devuan" or "Master" for Devuan.
# "Label" of "Master" has no equivalent in Debian/Ubuntu.
# Where this program uses "label" it really wants "origin".
self.origin = params.get("Origin")
self.label = self.origin
self.suite = params.get("Suite")
self.version = params.get("Version")
self.codename = params.get("Codename")
@ -84,16 +92,29 @@ class Release:
return date.today() - release_date
def is_relevant(self):
if self.label not in ("Debian", "Ubuntu", ):
if self.label not in ("Debian", "Ubuntu", "Devuan", ):
return False
bl1 = ("oldoldstable", "devel", )
if self.suite in bl1:
return False
if self.label == "Debian" or self.label == "Ubuntu":
bl1 = ("oldoldstable", "devel", )
if self.suite in bl1:
return False
bl2 = ("-updates", "-backports", "-security", "-proposed", "-sloppy", )
if any(self.suite.endswith(suffix) for suffix in bl2):
return False
bl2 = ("-updates", "-backports", "-security", "-proposed", "-sloppy", )
if any(self.suite.endswith(suffix) for suffix in bl2):
return False
if self.label == "Devuan":
# "oldoldstable" is maintained in Devuan.
# These are no longer maintained.
bl_ = ("jessie", "ascii", )
if self.suite in bl_:
return False
# For fine-grained control:
bl_ = ("-backports", "-security", "-proposed-updates", )
if any(self.suite.endswith(suffix) for suffix in bl_):
return False
if self.label == "Ubuntu":
if self.is_lts():
@ -108,10 +129,19 @@ class Release:
return True
if self.label == "Ubuntu" and self.age() < timedelta(days=0):
return True
if self.label == "Devuan" and self.suite == "experimental":
return True
return False
"""
# Note: get_releases(url) is deprecated because it cannot work with Devuan.
# Use get_dist_releases(url) instead.
#
# get_releases(url) requires the file ls-lR.gz to be present.
# This is not available in Devuan.
#
def get_releases(url):
dirlinepattern = re.compile(
r"\.(/dists/[\w\-]+):"
@ -172,6 +202,49 @@ def get_releases(url):
yield Release(u)
except urllib.error.URLError as e:
logger.warning("Failed to download %s: %s", relurl, e)
"""
def get_dist_releases(url):
    """Yield a Release for each dist directory listed at ``url + "/dists/"``.

    The index page is downloaded and its ``<a href>`` links are filtered
    with an XPath expression; for every remaining link the ``Release`` file
    inside that directory is opened and passed to ``Release``.

    Parameters:
        url: Base URL of the archive (e.g. "http://ftp.debian.org/debian").
            A trailing slash is tolerated.

    Yields:
        Release objects, one per dist directory whose ``Release`` file
        could be opened.

    Download failures (``urllib.error.URLError``) are logged as warnings:
    a failure on the index page ends the generator without yielding, a
    failure on a single ``Release`` file skips just that dist.
    """
    # lxml is only needed by this function, so it is imported locally.
    from lxml import html

    # Strip a trailing slash so we never request ".../debian//dists/".
    listurl = url.rstrip("/") + "/dists/"
    try:
        # Use a context manager so the HTTP response is always closed.
        with urllib.request.urlopen(listurl) as response:
            tree = html.fromstring(response.read())
        logger.debug("Downloaded %s", listurl)
    except urllib.error.URLError as e:
        logger.warning("Failed to download %s: %s", listurl, e)
        return

    # Extract dist names from the web links (<a href="{debiandir}">).
    # Using XPath 1.0:
    #   matches:        buster/, daedalus/, noble/, oldstable/, stable/, unstable/
    #   does not match: ../, /debian/, daedalus-updates/, 6.0/
    dist_path = (
        "//a[contains(@href,'/')"
        " and not(starts-with(@href,'/'))"
        " and not(contains(@href,'-'))"
        " and not(contains(@href,'.'))"
        "]/@href"
    )
    for debiandir in tree.xpath(dist_path):
        # Each matched href already ends with "/", so plain concatenation
        # yields ".../dists/<name>/Release".
        relurl = listurl + debiandir + "Release"
        try:
            with urllib.request.urlopen(relurl) as u:
                logger.debug("Downloaded %s", relurl)
                yield Release(u)
        except urllib.error.URLError as e:
            logger.warning("Failed to download %s: %s", relurl, e)
def write_csv(filename, releases, archs):
@ -181,6 +254,7 @@ def write_csv(filename, releases, archs):
for r in releases:
if not r.is_relevant():
logger.debug("Discarding as not relevant: %s ", repr(r))
continue
for arch in archs:
@ -203,11 +277,13 @@ def write_csv(filename, releases, archs):
if __name__ == "__main__":
logger.info("Downloading releases...")
debianreleases = set(get_releases("http://ftp.debian.org/debian"))
debianreleases = set(get_dist_releases("http://ftp.debian.org/debian"))
assert len(debianreleases) > 0
ubuntureleases = set(get_releases("http://ftp.ubuntu.com/ubuntu"))
ubuntureleases = set(get_dist_releases("http://ftp.ubuntu.com/ubuntu"))
assert len(ubuntureleases) > 0
releases = list(sorted(debianreleases | ubuntureleases))
devuanreleases = set(get_dist_releases("http://deb.devuan.org/merged"))
assert len(devuanreleases) > 0
releases = list(sorted(debianreleases | ubuntureleases | devuanreleases))
assert len(releases) > 0
logger.info("Found %d releases", len(releases))