Description: Add debian maintainer assistant
 This patch adds a helper script for maintainers of Debian FreeDict packages. See
 the documentation of that script to find out more.
Author: Sebastian Humenda <shumenda@gmx.de>
Last-Update: 2018-10-28

Index: freedict-tools/fetchdictdata.py
===================================================================
--- /dev/null
+++ freedict-tools/fetchdictdata.py
@@ -0,0 +1,619 @@
+#!/usr/bin/env python3
+"""This script is designed to help with the Debian FreeDict packaging. It can
+fetch all available databases from
+https://www.freedict.org/freedict-database.xml
+and generate an orig source tarball or generate debian/control and
+debian/copyright.
+
+Please use `--help` to find out more details.
+
+IMPORTANT: you must run this script from the root of the freedict source,
+otherwise the operations will fail.
+
+Generating debian/copyright and debian/control
+==============================================
+
+Since FreeDict packages contain a lot of very similar dictionaries, the process
+is made more convenient by generating the mentioned files. d/control is made up
+of a d/control.HEAD file (make sure this one exists), usually with the normal
+source stanza and followed by auto-generated stanzas, derived from information
+of the debian/freedict-database.xml. The XML API file is fetched automatically,
+if required.
+
+The d/copyright file is generated using licensecheck and a bit of guessing
+logic. The file d/copyright.snippets/TAIL is appended and usually contains the
+licence definition stanzas. If the copyright file contains "FIXME", the
+maintainer may also like to create a plain text file
+d/copyright.snippets/xxx-yyy. As soon as such a file is found, no licence
+information will be queried for the specified dictionary.
+"""
+
+# pylint: disable=multiple-imports,too-few-public-methods,global-statement
+import argparse
+import datetime
+import os
+import re
+import shutil
+import subprocess
+import sys
+import textwrap
+import urllib.request
+import xml.etree.ElementTree as ET
+from packaging.version import Version
+
+XML_URL = "https://www.freedict.org/freedict-database.xml"  # FreeDict API
+XML_API_PATH = "debian/freedict-database.xml"  # local copy of the API file
+__DICT2LONGNAME = {}  # filled by dictionarycode2longdescription
+
+
+def dictionarycode2longdescription(string):
+    """Turn a dictionary code such as `xxx-yyy` into a human-readable string.
+    The code is a hyphen-delimited pair of ISO 639-3 codes; the result is the
+    matching pair of language names, again joined by a hyphen. The ISO table
+    is parsed on first use and kept in the module-level `__DICT2LONGNAME`."""
+    global __DICT2LONGNAME
+    if not __DICT2LONGNAME:
+        __DICT2LONGNAME = parse_iso_table()
+    codes = string.split("-")
+    return "-".join((__DICT2LONGNAME[codes[0]], __DICT2LONGNAME[codes[1]]))
+
+
+def parse_iso_table():
+    """Parse the ISO codes from the tab-delimited data files debian/iso-*.
+    Returns a dict mapping the codes in columns 0 and 1 to the name in column 6
+    (minus parenthesised remarks). The first line and blank lines are ignored."""
+    tbl = {}
+    for item in os.listdir("debian"):
+        if item.startswith("iso-"):
+            with open(os.path.join("debian", item), "r", encoding="utf-8") as tbl_file:
+                rows = tbl_file.read().split("\n")[1:]
+            for row in rows:
+                if not row.strip():
+                    continue  # e.g. the empty string after the final newline
+                cols = row.split("\t")
+                name = re.sub(r"\s*\(.*?\)\s*", "", cols[6])
+                tbl[cols[0]] = name
+                if cols[1] != "":  # some have two language codes, add it:
+                    tbl[cols[1]] = name
+    return tbl
+
+
+def get_xml_content(fetch_new=False):
+    """Return the FreeDict XML API as a string.
+    The local copy at XML_API_PATH is used unless it is missing or `fetch_new`
+    is set; in that case the file is downloaded from XML_URL and saved first."""
+    if not fetch_new and os.path.exists(XML_API_PATH):
+        with open(XML_API_PATH, encoding="UTF-8") as cached:
+            return cached.read()
+    print(f"Downloading API from {XML_URL}")
+    with urllib.request.urlopen(XML_URL) as response:
+        content = response.read().decode("utf-8")
+    with open(XML_API_PATH, "w", encoding="UTF-8") as target:
+        target.write(content)
+    return content
+
+def run_licensecheck_on(file_path):
+    """Ask the external `licensecheck` tool for the licence of `file_path`.
+    Only the first line of its output is inspected. Returns the licence text
+    found on that line, or None if nothing matched or the match contains
+    UNKNOWN."""
+    command = ["licensecheck", "--copyright", "--deb-fmt", file_path]
+    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
+        output = proc.communicate()[0].decode("utf-8")
+    # str.split() always yields at least one element, so [0] cannot fail
+    first_line = output.lstrip().split("\n", maxsplit=1)[0]
+    found = re.search(r"^.+: (.+)$", first_line)
+    if found is None:
+        found = re.search(r"^.No copyright. (.+)$", first_line)
+    if found is None:
+        return None
+    licence = found.group(1)
+    return None if "UNKNOWN" in licence else licence
+
+def find_licence(dictionary):
+    """Find out the licence of `dictionary` (an object with a `name` attribute).
+
+    licensecheck is run on <name>/<name>.tei and then on <name>/COPYING.tei; its
+    answer is returned, minus a leading "*No copyright* " marker. If it finds
+    nothing, the TEI header (everything before `<body`) is scanned by a
+    self-brewed checker that recognises GPL and CC-BY-SA and tries to output
+    something like GPL, GPL-2, GPL-3, GPL-2+, GPL-3+ or CC-BY-SA-3.0. If that
+    fails as well, "FIXME" is returned."""
+    tei_fn = f"{dictionary.name}{os.sep}{dictionary.name}.tei"
+    licence = run_licensecheck_on(tei_fn)
+    if not licence:
+        licence = run_licensecheck_on(dictionary.name + os.sep + "COPYING.tei")
+    if not licence:
+        licence = "FIXME"  # backup solution
+    if licence != "FIXME":  # found a licence, return
+        no_copyright_by_licensecheck = "*No copyright* "
+        if licence.startswith(no_copyright_by_licensecheck):
+            licence = licence[len(no_copyright_by_licensecheck):]
+        return licence
+
+    # try guessing the licence from the TEI file
+    with open(tei_fn, "r", encoding="utf-8") as f:
+        in_header = True
+        line = "start"
+        lastline = ""
+        while in_header and line != "":
+            line = f.readline().lower()
+            if "<body" in line:
+                line = line[: line.find("<body")]  # parse everything before
+                in_header = False
+            if "gpl" in line or "gnu general public lic" in line:
+                licence = "GPL"
+                # try to extract version number; both the American and the
+                # British spelling of "license" may precede it
+                res = re.search(
+                    r"(?:version|ver\.|license|licence)\s+(\d+)", lastline + line
+                )
+                if res:
+                    licence += f"-{res.group(1)}"
+            elif "attribution-sharealike" in line:
+                licence = "CC-BY-SA"
+                version = re.search(r"sharealike \(?v?(\d+\.?\d*)", line)
+                if version:
+                    licence += f"-{version.group(1)}"
+            lastlines = lastline + line
+            if licence.startswith("GPL") and not licence.endswith("+") and (
+                re.search(".*later.*version", lastlines)
+                or "or later" in lastlines
+                or "and any later" in lastlines
+            ):
+                licence += "+"
+            lastline = line
+    return licence
+
+
+def recursive_text(node):
+    """Return the text of `node` and of all its descendants, newline-joined.
+    Blank or missing (None, as ElementTree uses for empty nodes) text is ""."""
+    text = node.text if node.text and node.text.strip() else ""
+    for child in node:
+        text += "\n" + recursive_text(child)
+    return text
+
+class Dictionary:
+    """A plain record holding the API data of one dictionary."""
+    def __init__(self, name):
+        self.name = name
+        self.headwords = self.version = self.status = None
+        self.maintainer = self.source_release = None
+
+    def set(self, key, val):
+        """Store `val` under the internal name of XML attribute `key`."""
+        internal_names = {"headwords": "headwords", "edition": "version",
+                          "status": "status", "maintainerName": "maintainer"}
+        attr = internal_names.get(key) if isinstance(key, str) else None
+        if attr is None:
+            raise ValueError(f"invalid key: {key}")
+        setattr(self, attr, val)
+
+class Action:
+    """Base class of the operations this script performs on the FD API."""
+    def __init__(self, package_name, dictionaries):
+        # name of the debian (source) package
+        self.package_name = package_name
+        self.dictionaries = dictionaries
+
+    def perform_action(self):
+        """Do the actual work; a no-op in this base class."""
+
+class GenerateControlCopyright(Action):
+    """Generate debian/control and debian/copyright from the API data."""
+    def __init__(self, package_name, dictionaries, no_desc_version):
+        super().__init__(package_name, dictionaries)
+        # keep/strip version number from pkg description
+        self.__desc_version = not no_desc_version
+
+    def perform_action(self):
+        """Write both control as well as the copyright file."""
+        self.write_control()
+        self.write_copyright()
+
+    def gen_control_for_dictionary(self, platform, dictionary, depends=None,
+            recommends = None, suggests=None, provides=None):
+        """Generate a list of tokens that, if joined by "", resembles a package
+        stanza; `depends` extends the comma-separated Depends field."""
+        tokens = [f"\nPackage: {platform}-freedict-{dictionary.name}\n"]
+        tokens.append("Architecture: all\n")
+        tokens.append("Depends: ${misc:Depends}")
+        tokens.append(f", {depends}\n" if depends else "\n")
+        if recommends:
+            tokens.append(f"Recommends: {recommends}\n")
+        if suggests:
+            tokens.append(f"Suggests: {suggests}\n")
+        if provides:
+            tokens.append(f"Provides: {provides}\n")
+        status = ""
+        if dictionary.status:
+            status = f" (FreeDict status: {dictionary.status})"
+        longname = dictionarycode2longdescription(dictionary.name)
+        tokens.append(f"Description: {longname} dictionary\n")
+        version = ""
+        if self.__desc_version:
+            version = f", version {dictionary.version}"
+        longdesc = (
+            f"This is the {longname} dictionary from the FreeDict "
+            f"project{version}. It contains {dictionary.headwords} "
+            f"headwords{status}. It can be used with various dictionary "
+            "applications."
+        )
+        # format description to 80 characters per line
+        tokens.append(
+            "\n".join(
+                textwrap.wrap(
+                    longdesc, width=79, initial_indent=" ", subsequent_indent=" "
+                )
+            )
+        )
+        tokens.append("\n")
+        return tokens
+
+    def write_control(self):
+        """Generate debian/control from debian/control.HEAD and the gathered
+        dictionary data."""
+        with open("debian/control.HEAD", "r", encoding="utf-8") as file:
+            tokens = [file.read().rstrip(), "\n\n"]
+
+        for dictionary in self.dictionaries:
+            tokens.extend(self.gen_control_for_dictionary("dict", dictionary,
+                    suggests = ("dictd | dicod, dict | goldendict "
+                        "| dictionaryreader.app"),
+                    provides = "dictd-dictionary"))
+            tokens.extend(self.gen_control_for_dictionary("stardict",
+                    dictionary,
+                    recommends = ("sdcv (>= 0.5~) | stardict (>= 3.0~) "
+                        "| stardict-gtk (>= 3.0~) | goldendict | qstardict")))
+
+        with open("debian/control", "w", encoding="UTF-8") as file:
+            file.write("".join(tokens))
+
+    def write_copyright(self):
+        """Generate debian/copyright from debian/copyright.snippets/HEAD, one
+        stanza per dictionary and debian/copyright.snippets/TAIL."""
+        upstream_last_touched = int(
+            subprocess.check_output(["dpkg-parsechangelog", "-S", "version"])
+            .decode(sys.getdefaultencoding())
+            .split(".", 1)[0]
+            .strip()
+        )
+        cprght_snippets = "{1}{0}{2}{0}".format(os.sep, "debian", "copyright.snippets")
+        with open(cprght_snippets + "HEAD", encoding="utf-8") as f:
+            string = [f.read(), "\n"]
+        for dictionary in self.dictionaries:
+            # is there a snippet for an exception to licensecheck?
+            exception_snippet = cprght_snippets + dictionary.name
+            if os.path.exists(exception_snippet):
+                with open(exception_snippet, encoding="utf-8") as f:
+                    string.append("\n" + f.read())
+            else:
+                string.append(f"\nFiles: {dictionary.name}/*\n")
+                string.append(
+                    f"Copyright: 2000-{upstream_last_touched} FreeDict contributors\n"
+                )
+                string.append(f"License: {find_licence(dictionary)}\n")
+        with open(cprght_snippets + "TAIL", encoding="UTF-8") as f:
+            string += ["\n\n", f.read()]
+
+        document = "".join(string)
+        with open("debian/copyright", "w", encoding="utf-8") as f:
+            f.write(document)
+        if "FIXME" in document:
+            print(
+                'NOTE: some licences could not be extracted, search for "FIXME" in debian/copyright.'
+            )
+
+
+class FetchSource(Action):
+    """Fetch the source releases of all dictionaries into one orig tarball."""
+
+    def __init__(self, package_name, dictionaries):
+        super().__init__(package_name, dictionaries)
+        self.date = self.gen_date()
+        self.dirname = f"{package_name}-{self.date}.orig"
+        self.exclude_dictionaries = []
+        # NOTE(review): `-x` is read straight from sys.argv and only in the
+        # exact form `<script> <option> -x "<names>"`; parse_args() declares no
+        # such option, so argparse presumably rejects it beforehand - confirm.
+        if len(sys.argv) == 4:  # there's the -x option given
+            if sys.argv[2] == "-x":
+                self.exclude_dictionaries = sys.argv[3].split(" ")
+
+    def gen_date(self):
+        """Return today's date in format "yyyy.mm.dd"."""
+        return datetime.datetime.now().strftime("%Y.%m.%d")
+
+    def prepare_environment(self):
+        """Create an empty download directory `self.dirname` (removing a
+        left-over one first) and make it the current working directory.
+        """
+        if os.path.exists(self.dirname):
+            print(
+                "Removing %s; possibly left over from an interrupted run."
+                % self.dirname
+            )
+            shutil.rmtree(self.dirname)
+        os.mkdir(self.dirname)
+        os.chdir(self.dirname)
+
+    def clean_up(self):
+        """
+        Compress the original source, move it to the right destination and
+        remove download directory. Exits with status 9 if tar fails."""
+        tarname = f"{self.package_name}_{self.date}.orig.tar.xz"
+        # leave the download directory entered by prepare_environment()
+        os.chdir("..")
+        ret = subprocess.call(["tar", "cJf", tarname, self.dirname])
+        if ret:
+            # tar failed: abort and leave the download directory in place
+            sys.exit(9)
+        print("Moving tar archive upward to.", os.path.join("..", tarname))
+        os.rename(tarname, ".." + os.sep + tarname)
+        shutil.rmtree(self.dirname)
+
+    def perform_action(self):
+        """Download and unpack the source release of every dictionary, then
+        pack the result into the orig tarball (see clean_up).
+
+        Dictionaries listed in `exclude_dictionaries` or lacking a source
+        release are skipped. Exits with status 1 on an unknown archive format;
+        a failing download or extraction raises."""
+        self.prepare_environment()
+        imported = 0
+        for dictionary in self.dictionaries:
+            if dictionary.name in self.exclude_dictionaries:
+                print(f"Skip {dictionary.name} (specified via commmand line)")
+                continue
+            if dictionary.source_release is None:
+                print(f"Skip {dictionary.name} (no source release)")
+                continue
+            fname = dictionary.source_release.split("/")[-1]
+            print(f"Fetching {dictionary.name} from {dictionary.source_release}")
+            try:
+                with urllib.request.urlopen(dictionary.source_release) as u:
+                    data = u.read()
+            except urllib.error.HTTPError as h:
+                if int(h.code) == 404:
+                    reason = f"{str(h)}: {dictionary.source_release}"
+                    raise urllib.error.URLError(reason) from None
+                raise
+
+            with open(fname, "wb") as f:
+                f.write(data)
+            print("Extracting", fname)
+            if fname.endswith(".zip"):
+                # argument list instead of a shell string: no quoting issues
+                subprocess.run(["unzip", "-qq", fname], check=True)
+            elif fname.endswith((".tar.bz2", ".tar.gz", ".tar.xz")):
+                subprocess.run(["tar", "xf", fname], check=True)
+            else:
+                print(f'E: unknown format of "{fname}".')
+                sys.exit(1)  # an error must not exit with status 0
+            # each archive is extracted exactly once, by the branch above
+            os.remove(fname)
+            imported += 1
+        print("Imported %d dictionaries." % imported)
+        self.clean_up()
+
+class CheckForMoreRecentDicts(Action):
+    """Check whether the downstream copies are outdated."""
+    @staticmethod
+    def _local_edition(dictionary):
+        """Return the first <edition> found in the local TEI file, or None."""
+        path = f"{dictionary.name}/{dictionary.name}.tei"
+        with open(path, encoding="UTF-8") as file:
+            for line in file:
+                found = re.search("<edition>(.+?)</edition>", line)
+                if found:
+                    return found.group(1)
+        return None
+
+    def perform_action(self):
+        """Report every dictionary whose local edition is older than the one
+        announced upstream; only the part before the first `-` is compared."""
+        outdated = []
+        for dictionary in self.dictionaries:
+            local = self._local_edition(dictionary)
+            if not local:
+                raise ValueError(f"unable to get version from {dictionary.name}")
+            local_version = Version(local.split("-")[0])
+            if local_version < Version(dictionary.version.split("-")[0]):
+                outdated.append((dictionary, local))
+        if not outdated:
+            print("All dictionaries up-to-date.")
+            return
+        print("Updates required for:")
+        for dictionary, old_version in outdated:
+            print(f"  {dictionary.name}: old {old_version}, new {dictionary.version}")
+
+class Criteria:
+    """A criteria matcher: Use the string
+    sourceURL:wikdict
+    to filter for all dictionaries with wikdict in their URL and use
+    sourceURL!wikdict
+    to filter for all dictionaries not derived from wikdict. The part before
+    the first `:` or `!` is an XML attribute name, the rest a regular expression.
+    This class can cope with empty criteria: all matches will be true."""
+
+    def __init__(self, criteria):
+        self.__rgx = None
+        self.__delim = None
+        self.__criteria_attr = None
+        if not criteria:
+            return  # nothing to filter on
+        parts = re.match(r"([^:!]*)([:!])(.*)", criteria, re.DOTALL)
+        if not parts:
+            raise ValueError(
+                "criteria needs to consist of a dictionary "
+                "attribute followed by `:`  or `!` followed by a regular "
+                "expression"
+            )
+        self.__criteria_attr, self.__delim, pattern = parts.groups()
+        self.__rgx = re.compile(pattern)
+
+    def matches(self, dictnode):
+        """Test whether given dictnode matches configured criteria."""
+        if not self.__criteria_attr:
+            return True  # no criteria to match on, include unconditionally
+        attr = dictnode.get(self.__criteria_attr)
+        if not attr:
+            # nothing to match on: a positive criteria (`:`) cannot succeed,
+            return self.__delim != ":"  # a negative one (`!`) trivially does
+        found = bool(self.__rgx.search(attr))
+        return found if self.__delim == ":" else not found
+
+
+def clean_up_tree(root, criteria):
+    """Iterate over XML tree and delete those <dictionary/>-nodes which have no
+    release or which don't match the given criteria.
+
+    `root` is modified in place. A warning is printed if not a single node
+    had releases and matched."""
+    matcher = Criteria(criteria)
+    criteria_matched = False  # track whether a criteria was matched
+    # iterate over a snapshot: every child, including the last, is visited once
+    for dictionary in list(root):
+        has_releases = bool(list(dictionary))  # children are the releases
+        if has_releases and matcher.matches(dictionary):
+            criteria_matched = True
+            continue
+        # a node without a name attribute is reported by its tag instead
+        print(f'Skipping {dictionary.get("name", dictionary.tag)}, ', end="")
+        print("no releases" if not has_releases else "didn't match criteria")
+        root.remove(dictionary)
+    if not criteria_matched:
+        print("Warning: given criteria never matched")
+
+
+def parse_api(root) -> "list[Dictionary]":
+    """Parse all dictionaries from the given XML FreeDict API.
+
+    Returns a list of Dictionary objects in document order; no sorting is done
+    here. Children without releases, without a name or with a tag other than
+    `dictionary` are skipped. Raises KeyError if a mandatory attribute
+    (everything but `status`) is missing."""
+    dictionaries = []
+    for child in root:
+        if not list(child) or not child.attrib.get("name") \
+                or not child.tag.endswith("dictionary"):
+            continue  # skip dictionaries without releases or non-dictionary nodes
+        dictionary = Dictionary(child.attrib["name"])
+        for key in ("headwords", "edition", "status", "maintainerName"):
+            if key in child.attrib:
+                dictionary.set(key, child.attrib[key])
+            elif key != "status":  # the status is the only optional attribute
+                raise KeyError(
+                    f'missing attribute for {child.attrib["name"]}: {key}'
+                )
+        for release in child:
+            if release.attrib.get("platform") == "src":
+                dictionary.source_release = release.attrib["URL"]
+                break  # only one source release is recorded
+        dictionaries.append(dictionary)
+    return dictionaries
+
+
+def parse_args():
+    """Parse the command line and return the argparse namespace.
+
+    As a side effect, the working directory is changed to the parent if the
+    script was started inside debian/. The process exits with status 127
+    unless the working directory contains an entry named like a dictionary
+    (xxx-yyy). Without any arguments, the usage is printed."""
+    if os.getcwd().endswith("debian"):
+        os.chdir("..")
+    if not any(re.match(r"[a-z]{3}-[a-z]{3}", f) for f in os.listdir(os.getcwd())):
+        print("You must run this script from the FreeDict packaging root.")
+        sys.exit(127)
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--criteria",
+        dest="criteria",
+        help=(
+            "A criteria is a XML attribute name (from the Dictionary tag),"
+            " separated by a delimiter, followed by a regular expression. "
+            "This can be used to filter dictionaries. If attribute name and"
+            " regular expression are separated by a colon `:`, the "
+            "dictionary node MUST match the given expression, if it "
+            "separated by a exclamation mark `!`, these dictionaries will "
+            "be dropped."
+        ),
+    )
+    parser.add_argument(
+        "--dc",
+        dest="gen_control_copyright",
+        action="store_true",
+        help="generate debian/copyright and debian/control",
+    )
+    parser.add_argument(
+        "--need-update",
+        dest="check_for_upstream_dict_updates",
+        action="store_true",
+        help="check whether any package needs an update (implies -u)",
+    )
+    parser.add_argument(
+        "--no-desc-version",
+        dest="no_desc_version",
+        action="store_true",
+        help="omit the version number in package descriptions (only "
+        "useful when using --dc)",
+    )
+    parser.add_argument(
+        "--orig",
+        dest="fetch_orig",
+        action="store_true",
+        help="fetch a new orig source tar ball to ../",
+    )
+    parser.add_argument(
+        "-u",
+        dest="update_xml_api",
+        action="store_true",
+        help="Update FreeDict XML API file and exit.",
+    )
+    if len(sys.argv) == 1:
+        parser.print_usage()
+    return parser.parse_args()
+
+
+def main():
+    """Entry point: parse the arguments and run the requested actions.
+
+    The XML API file is downloaded (once) if -u, --orig or --need-update was
+    given or if no local copy exists; otherwise the local copy is used. The
+    actions run in the order --orig, --dc, --need-update."""
+    # parse_args() also checks that we are in the correct directory
+    args = parse_args()
+    fetch_new = (args.update_xml_api or args.fetch_orig
+        or args.check_for_upstream_dict_updates)
+    # a single call suffices: the file is fetched at most once per run
+    xmlsrc = get_xml_content(fetch_new)
+    # actions can be combined
+    actions = []
+    if args.fetch_orig:
+        actions.append(FetchSource)
+    if args.gen_control_copyright:
+        actions.append(GenerateControlCopyright)
+    if args.check_for_upstream_dict_updates:
+        actions.append(CheckForMoreRecentDicts)
+    package_name = subprocess.check_output(["dpkg-parsechangelog",
+            "-S", "Source"]) \
+        .decode(sys.getdefaultencoding()) \
+        .strip()
+
+    # usual operation
+    root = ET.fromstring(xmlsrc)
+    clean_up_tree(root, criteria=args.criteria)
+    dictionaries = parse_api(root)
+    for action in actions:
+        if action is GenerateControlCopyright:
+            #pylint: disable=too-many-function-args
+            inst = action(package_name, dictionaries, args.no_desc_version)
+        else:
+            inst = action(package_name, dictionaries)
+        inst.perform_action()
+
+
+if __name__ == "__main__":
+    main()
