From 68800a35ea5965a9e6e84ab8952e7962fae4bc43 Mon Sep 17 00:00:00 2001 From: Osamu Aoki Date: Thu, 24 Apr 2025 10:19:44 +0900 Subject: [PATCH] Improve PO file checker Signed-off-by: Osamu Aoki --- Makefile | 4 + README.md | 6 ++ bin/xmlpo | 318 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 176 insertions(+), 152 deletions(-) diff --git a/Makefile b/Makefile index 3d03093b..431ba586 100644 --- a/Makefile +++ b/Makefile @@ -834,6 +834,10 @@ check: for XX in $(foreach LX, $(LANGPO), $(DPO)/$(LX).po); do \ $(MSGCAT) --no-wrap $$XX | grep -e '^msgstr "http.*%' || true ;\ done + # check for XML sanity in PO% + for XX in $(foreach LX, $(LANGPO), $(DPO)/$(LX).po); do \ + $(DBIN)/xmlpo $$XX || true ;\ + done stat: @$(call check-command, msgcat, gettext) diff --git a/README.md b/README.md index aa5177f2..d91a146b 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,8 @@ Let's assume you are updating "fr": $ make wrap LANGPO=fr; make po LANGPO=fr # check fuzzy.log ... hack-hack-hack $ make wrap LANGPO=fr; make po LANGPO=fr # check fuzzy.log + $ make check LANGPO=fr + ... check sanity of PO file (optional) $ make test LANGPO=fr ... check build/ for good HTML build $ git add po/fr.po @@ -172,6 +174,10 @@ Let's assume you are updating "fr": $ gitk --all # check no remote updates $ git push origin latest ``` +Here, `make check` is an optional step. It helps you locate errors in the PO +file more easily than tracing them through the test HTML build. If you +intentionally add supplemental content with extra XML tags, you can add a +translator comment "`skip-tag-match`" to each such PO entry. If merging is more complicated, you can create a local topic branch and work. You can use such a topic branch as a remote backup, too. 
diff --git a/bin/xmlpo b/bin/xmlpo index 532b5b17..c359fe35 100755 --- a/bin/xmlpo +++ b/bin/xmlpo @@ -18,188 +18,202 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ -import argparse -import locale -import collections import sys +import argparse +import collections import xml.etree.ElementTree as ET +# You must install python3-polib package +import polib + + ####################################################################### -# Global variables +# PO data XML analyzer class ####################################################################### -verbose = 0 # quiet -# verbose = 1: default -# verbose = 2: verbose -# verbose = 3: debug +class XMLPOEntry: + def __init__(self, **kwargs): + self.linenum = kwargs.get("linenum", 0) + self.tcomment = kwargs.get("tcomment", "") + self.msgid = kwargs.get("msgid", "") + self.msgstr = kwargs.get("msgstr", "") + self.skip_fuzzy = kwargs.get("skip_fuzzy", True) + self.skip_obsolete = kwargs.get("skip_obsolete", True) + self.mask_entity = kwargs.get("mask_entity", True) + self.mask_namespace = kwargs.get("mask_namespace", True) + self.check_skip_tag_match = kwargs.get("check_skip_tag_match", True) + self.xmsgid = self.msgid + self.xmsgstr = self.msgstr + if self.mask_entity: + self.xmsgid = self.xmsgid.replace("&", "_") + self.xmsgstr = self.xmsgstr.replace("&", "_") + if self.mask_namespace: + self.xmsgid = self.xmsgid.replace("xl:href=", "xl_href=") + self.xmsgstr = self.xmsgstr.replace("xl:href=", "xl_href=") + # msgid xml analyze + self.xml_msgid_err = None + xml_msgid = ET.fromstring("<msg></msg>") + try: + xml_msgid = ET.fromstring("<msg>" + self.xmsgid + "</msg>") + except ET.ParseError as err: + # look for error position + col = max(err.position[1] - len("<msg>"), 0) + self.xml_msgid_err = polib.escape(self.msgid[:col]) + '"<<< ERROR' + except Exception as err: + self.xml_msgid_err = "{} error: {}".format(type(err), err) + self.xml_msgid_tags = 
collections.Counter( + [element.tag for element in xml_msgid.iter()] + ) + del self.xml_msgid_tags["msg"] + # msgstr xml analyze + self.xml_msgstr_err = None + xml_msgstr = ET.fromstring("<msg></msg>") + try: + xml_msgstr = ET.fromstring("<msg>" + self.xmsgstr + "</msg>") + except ET.ParseError as err: + # look for error position + col = max(err.position[1] - len("<msg>"), 0) + self.xml_msgstr_err = polib.escape(self.msgstr[:col]) + '"<<< ERROR' + except Exception as err: + self.xml_msgstr_err = "{} error: {}".format(type(err), err) + self.xml_msgstr_tags = collections.Counter( + [element.tag for element in xml_msgstr.iter()] + ) + del self.xml_msgstr_tags["msg"] + return + + def is_unmatched_xml(self): + if self.msgstr == "": + # ignore not-yet-translated data + return False + elif self.xml_msgid_err is not None: + # ignore non-valid XML in msgid + return False + elif self.xml_msgstr_err is not None: + # ignore non-valid XML in msgstr + return False + elif "skip-tag-match" in self.tcomment and self.check_skip_tag_match: + # ignore data with "skip-tag-match" in its translator comment + return False + elif self.xml_msgid_tags == self.xml_msgstr_tags: + return False + else: + return True + # + + def print_error(self): + if self.xml_msgid_err is not None: + print("E: msgid XML error at {}".format(self.linenum)) + print(" {}".format(self.xml_msgid_err)) + if self.xml_msgstr_err is not None: + print("E: msgstr XML error at {}".format(self.linenum)) + print(" {}".format(self.xml_msgstr_err)) + if self.xml_msgid_err is not None or self.xml_msgstr_err is not None: + print(' msgid "{}"'.format(polib.escape(self.msgid))) + print(' msgstr "{}"'.format(polib.escape(self.msgstr))) + print() + + def print_unmatched_tags(self): + # unmatched tags + if self.is_unmatched_xml() is True: + print("W: unmatched XML tag at {}".format(self.linenum)) + print(" msgid_tags = {}".format(self.xml_msgid_tags)) + print(" msgstr_tags = {}".format(self.xml_msgstr_tags)) + print(' msgid "{}"'.format(polib.escape(self.msgid))) 
+ print(' msgstr "{}"'.format(polib.escape(self.msgstr))) + print() + return ####################################################################### # main: parse command line parser ####################################################################### def main(): - locale.setlocale(locale.LC_ALL, "en_US.UTF-8") parser = argparse.ArgumentParser( description="""\ -xml tag checker for po-file - -When PO file is generated from DocBook XML or similar file, it will contain -some XML markers. Many translation errors come from typos around such markers. - -This checker will find unmatched set of XML markers between msgid a msgstr. - -Return 0, if no error. Return count of errors, if the error is found. +analyzer for po-file copyright 2024 Osamu Aoki license: MIT - """ ) - parser.add_argument("-v", "--verbose", action="count", default=1, help="verbose") - parser.add_argument( - "-m", - "--msg", - action="store_true", - default=False, - help="print msgid and msgstr for each error", - ) - parser.add_argument( - "-i", - "--ignore-entity", - action="store_true", - default=False, - help="ignore entity by replacing & with _", - ) parser.add_argument( "-f", - "--test-fuzzy", + "--include-fuzzy", action="store_true", default=False, - help="test applies to fuzzy msg too", + help="force to include fuzzy PO entries", + ) + parser.add_argument( + "-o", + "--include-obsolete", + action="store_true", + default=False, + help="force to include obsolete PO entries", + ) + parser.add_argument( + "-t", + "--ignore-skip-tag-match", + action="store_true", + default=False, + help="force to ignore skip-tag-match in translator comment", + ) + parser.add_argument( + "-e", + "--expose-entity", + action="store_true", + default=False, + help="force to disable masking of '&' by '_' (expose entity)", + ) + parser.add_argument( + "-n", + "--expose-namespace", + action="store_true", + default=False, + help="force to disable masking of 'xl:href=' by 'xl_href=' (expose namespace)", ) 
parser.add_argument("pofile", help="po file to be analyzed") ####################################################################### # generate argument parser instance ####################################################################### args = parser.parse_args() - # verbose = args.verbose ####################################################################### - state = "" # "msgid"/"msgstr"/"" - state_last = "" - msgid_str = "" - msgstr_str = "" - fuzzy = False - msgstr_lnum = 0 + print("I: Process pofile='{}'".format(args.pofile)) + print() + try: + po = polib.pofile(args.pofile) + except Exception as err: + print("{} error: {} for PO file='{}'".format(type(err), err, args.pofile)) + sys.exit(1) error_count = 0 - with open(args.pofile, "r") as fp: - for lnum, line in enumerate(fp.readlines()): - line = line.strip() # remove NL - if line.startswith("msgid"): - state = "msgid" - msgid_str = line[len("msgid ") :].strip()[1:-1] - elif line.startswith("msgstr"): - state = "msgstr" - msgstr_lnum = lnum - msgstr_str = line[len("msgstr ") :].strip()[1:-1] - elif line.startswith('"'): - if state == "msgid": - msgid_str += line[1:-1] - elif state == "msgstr": - msgstr_str += line[1:-1] - else: - # line number should start at 1 like editor - print("E: **INVALID** PO file line={}: '{}'".format(lnum + 1, line)) - sys.exit(2) - elif line.startswith("#") and "fuzzy" in line: - state = "#" - fuzzy = True - elif line.startswith("#"): - state = "#" - else: - state = "" - if state == "" and state_last == "msgstr": - fuzzy_in = fuzzy - fuzzy = False - # ready to report - # print("I: ----------------------------------------------------------") - if msgid_str == "" or msgstr_str == "" or "<" not in msgid_str: - # notworth analyzing - continue - if not args.test_fuzzy and fuzzy_in: - # test_fuzzy=*, fuzzy_in=False -> test - # test_fuzzy=True, fuzzy_in=True -> test - # test_fuzzy=False, fuzzy_in=True -> don't test - continue - # normalize - msgid_str = msgid_str.replace("xl:href", 
"href").replace('\\"', '"') - msgstr_str = msgstr_str.replace("xl:href", "href").replace('\\"', '"') - if args.ignore_entity: - msgid_str = msgid_str.replace("&", "_") - msgstr_str = msgstr_str.replace("&", "_") - # msgstr is not "" and msgid may have XML tag - xml_msgid = ET.fromstring("<msg></msg>") - xml_msgstr = ET.fromstring("<msg></msg>") - err0_str = "" - try: - xml_msgid = ET.fromstring("<msg>" + msgid_str + "</msg>") - except ET.ParseError as err0: - valid_msgid = False - # look for error position - col0 = max(err0.position[1] - len("<msg>"), 0) - err0_str = msgid_str[col0 : col0 + 20] - except Exception as err0: - valid_msgid = False - print(f"err0 unexpected {err0=}, {type(err0)=}") - else: - valid_msgid = True - err1_str = "" - try: - xml_msgstr = ET.fromstring("<msg>" + msgstr_str + "</msg>") - except ET.ParseError as err1: - valid_msgstr = False - # look for error position - col1 = max(err1.position[1] - len("<msg>"), 0) - err1_str = msgstr_str[col1 : col1 + 20] - except Exception as _: - valid_msgstr = False - else: - valid_msgstr = True - if valid_msgid and valid_msgstr: - tags_msgid = collections.Counter( - [element.tag for element in xml_msgid.iter()] - ) - del tags_msgid["msg"] - tags_msgstr = collections.Counter( - [element.tag for element in xml_msgstr.iter()] - ) - del tags_msgstr["msg"] - if tags_msgid == tags_msgstr: - # print("I: line={} valid XML and matched XML tags msgid={}".format(msgstr_lnum, tags_msgid)) - pass - else: - # line number should start at 1 like editor - print( - "E: line={} **UNMATCHED XML TAG: fuzzy={} tags_msgid={} tags_msgstr={}".format( - msgstr_lnum + 1, fuzzy_in, tags_msgid, tags_msgstr - ) - ) - if args.msg: - print(" msgid = '{}'".format(msgid_str)) - print(" msgstr = '{}'".format(msgstr_str)) - error_count += 1 - else: - # line number should start at 1 like editor - print( - "E: line={} **INVALID** XML: fuzzy={} error at msgid='{}' msgstr='{}' (truncated)".format( - msgstr_lnum + 1, fuzzy_in, err0_str, err1_str - ) - ) - if args.msg: - print(" msgid = 
'{}'".format(msgid_str)) - print(" msgstr = '{}'".format(msgstr_str)) - error_count += 1 - state_last = state - print("ERROR COUNT = {}".format(error_count)) - sys.exit(error_count) + warn_count = 0 + for entry in po: + if entry.fuzzy and not args.include_fuzzy: + continue + if entry.obsolete and not args.include_obsolete: + continue + xmlentry = XMLPOEntry( + linenum=entry.linenum, + tcomment=entry.tcomment, + fuzzy=entry.fuzzy, + obsolete=entry.obsolete, + msgid=entry.msgid, + msgstr=entry.msgstr, + mask_entity=not args.expose_entity, + mask_namespace=not args.expose_namespace, + check_skip_tag_match=not args.ignore_skip_tag_match, + ) + xmlentry.print_error() + xmlentry.print_unmatched_tags() + if ( + xmlentry.xml_msgid_err is not None + or xmlentry.xml_msgstr_err is not None + ): + error_count += 1 + if xmlentry.is_unmatched_xml() is True: + warn_count += 1 + print("E: XML markup error counts = {}".format(error_count)) + print("W: XML unmatched tag counts = {}".format(warn_count)) + sys.exit(error_count + warn_count) #######################################################################