Improve PO file checker

Signed-off-by: Osamu Aoki <osamu@debian.org>
This commit is contained in:
Osamu Aoki 2025-04-24 10:19:44 +09:00
parent 335a23012b
commit 68800a35ea
3 changed files with 176 additions and 152 deletions

View file

@ -834,6 +834,10 @@ check:
for XX in $(foreach LX, $(LANGPO), $(DPO)/$(LX).po); do \
$(MSGCAT) --no-wrap $$XX | grep -e '^msgstr "http.*%' || true ;\
done
# check for XML sanity in PO files
for XX in $(foreach LX, $(LANGPO), $(DPO)/$(LX).po); do \
$(DBIN)/xmlpo $$XX || true ;\
done
stat:
@$(call check-command, msgcat, gettext)

View file

@ -164,6 +164,8 @@ Let's assume you are updating "fr":
$ make wrap LANGPO=fr; make po LANGPO=fr # check fuzzy.log
... hack-hack-hack
$ make wrap LANGPO=fr; make po LANGPO=fr # check fuzzy.log
$ make check LANGPO=fr
... check sanity of PO file (optional)
$ make test LANGPO=fr
... check build/ for good HTML build
$ git add po/fr.po
@ -172,6 +174,10 @@ Let's assume you are updating "fr":
$ gitk --all # check no remote updates
$ git push origin latest
```
Here, `make check` is an optional step. It helps you locate errors in the PO
file more easily than tracing them through the test HTML build. If you
intentionally add supplemental content with extra XML tags, add the translator
comment "`skip-tag-match`" to each such PO entry.
If merging is more complicated, you can create a local topic branch and work on it.
You can use such a topic branch as a remote backup, too.

318
bin/xmlpo
View file

@ -18,188 +18,202 @@ along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
"""
import argparse
import locale
import collections
import sys
import argparse
import collections
import xml.etree.ElementTree as ET
# You must install python3-polib package
import polib
#######################################################################
# Global variables
# PO data XML analyzer class
#######################################################################
verbose = 0 # quiet
# verbose = 1: default
# verbose = 2: verbose
# verbose = 3: debug
class XMLPOEntry:
    """XML analysis of the msgid and msgstr of one PO entry.

    Each message is wrapped in a ``<msg>`` element and parsed with
    ``xml.etree.ElementTree``.  The result is exposed through these
    attributes:

    * ``xml_msgid_err`` / ``xml_msgstr_err``: ``None`` when the message
      parsed, otherwise a description of the failure.
    * ``xml_msgid_tags`` / ``xml_msgstr_tags``: ``collections.Counter`` of the
      element tags found (empty when the message failed to parse).

    Keyword arguments (all optional):
        linenum: line number reported in the printed diagnostics (default 0).
        tcomment: translator comment of the entry (default "").
        msgid, msgstr: the messages to analyse (default "").
        skip_fuzzy, skip_obsolete: stored on the instance (default True);
            not consulted by this class.
        mask_entity: replace "&" with "_" before parsing (default True).
        mask_namespace: replace "xl:href=" with "xl_href=" before parsing
            (default True).
        check_skip_tag_match: honour "skip-tag-match" in ``tcomment``
            (default True).
    """

    # Every message is wrapped in this element so that a fragment with
    # several (or no) top-level nodes still parses as one XML document.
    _WRAP_OPEN = "<msg>"
    _WRAP_CLOSE = "</msg>"
    _WRAP_TAG = "msg"

    def __init__(self, **kwargs):
        self.linenum = kwargs.get("linenum", 0)
        self.tcomment = kwargs.get("tcomment", "")
        self.msgid = kwargs.get("msgid", "")
        self.msgstr = kwargs.get("msgstr", "")
        self.skip_fuzzy = kwargs.get("skip_fuzzy", True)
        self.skip_obsolete = kwargs.get("skip_obsolete", True)
        self.mask_entity = kwargs.get("mask_entity", True)
        self.mask_namespace = kwargs.get("mask_namespace", True)
        self.check_skip_tag_match = kwargs.get("check_skip_tag_match", True)
        # Masked copies are what actually gets parsed.
        self.xmsgid = self._mask(self.msgid)
        self.xmsgstr = self._mask(self.msgstr)
        # msgid XML analysis
        self.xml_msgid_err, self.xml_msgid_tags = self._analyze(
            self.msgid, self.xmsgid
        )
        # msgstr XML analysis
        self.xml_msgstr_err, self.xml_msgstr_tags = self._analyze(
            self.msgstr, self.xmsgstr
        )

    def _mask(self, text):
        """Return *text* with the enabled maskings applied.

        Both replacements keep the string length, so an offset into the
        masked text is also a valid offset into the original text.
        """
        if self.mask_entity:
            text = text.replace("&", "_")
        if self.mask_namespace:
            text = text.replace("xl:href=", "xl_href=")
        return text

    @staticmethod
    def _error_offset(wrapped, position, prefix_len):
        """Convert a ``ParseError.position`` into an offset in the raw message.

        *position* is the ``(line, column)`` pair of the error inside
        *wrapped*; *prefix_len* is the length of the wrapper prepended to the
        message.  The column alone is only an offset when the error is on the
        first line, so the start of the error line is added first.  The
        result is clamped to 0.
        """
        line, column = position
        line_start = 0
        # Lines are located by "\n" only.
        # NOTE(review): a message using lone "\r" line breaks would need
        # extra handling -- confirm such input cannot occur.
        for _ in range(line - 1):
            newline = wrapped.find("\n", line_start)
            if newline < 0:
                break
            line_start = newline + 1
        return max(line_start + column - prefix_len, 0)

    def _analyze(self, raw, masked):
        """Parse one message and return ``(error, tags)``.

        *raw* is the original message (used for the error report) and
        *masked* the text that is parsed.  ``error`` is ``None`` on success.
        ``tags`` counts the element tags without the wrapper element and is
        empty when parsing failed.
        """
        wrapped = self._WRAP_OPEN + masked + self._WRAP_CLOSE
        error = None
        root = None
        try:
            root = ET.fromstring(wrapped)
        except ET.ParseError as err:
            # Report everything up to the error position, then a marker.
            offset = self._error_offset(
                wrapped, err.position, len(self._WRAP_OPEN)
            )
            error = polib.escape(raw[:offset]) + '"<<< ERROR'
        except Exception as err:
            # Deliberately broad: one unparsable entry must not abort the
            # whole run; the failure is recorded and reported instead.
            error = "{} error: {}".format(type(err), err)
        tags = collections.Counter()
        if root is not None:
            tags.update(element.tag for element in root.iter())
            del tags[self._WRAP_TAG]
        return error, tags

    def is_unmatched_xml(self):
        """Return True when msgid and msgstr are valid XML with different tags.

        Returns False for untranslated entries, for entries where either
        message failed to parse (reported by ``print_error`` instead), and
        for entries whose translator comment contains "skip-tag-match" while
        ``check_skip_tag_match`` is enabled.
        """
        if self.msgstr == "":
            # ignore not-yet-translated data
            return False
        if self.xml_msgid_err is not None or self.xml_msgstr_err is not None:
            # ignore non-valid XML in msgid or msgstr
            return False
        if self.check_skip_tag_match and "skip-tag-match" in self.tcomment:
            # ignore data with "skip-tag-match" in its translator comment
            return False
        return self.xml_msgid_tags != self.xml_msgstr_tags

    def print_error(self):
        """Print an "E:" report for each message that failed to parse."""
        msgid_bad = self.xml_msgid_err is not None
        msgstr_bad = self.xml_msgstr_err is not None
        if msgid_bad:
            print("E: msgid XML error at {}".format(self.linenum))
            print("    {}".format(self.xml_msgid_err))
        if msgstr_bad:
            print("E: msgstr XML error at {}".format(self.linenum))
            print("    {}".format(self.xml_msgstr_err))
        if msgid_bad or msgstr_bad:
            print('    msgid  "{}"'.format(polib.escape(self.msgid)))
            print('    msgstr "{}"'.format(polib.escape(self.msgstr)))
            print()

    def print_unmatched_tags(self):
        """Print a "W:" report when the tag counts of msgid and msgstr differ."""
        # unmatched tags
        if self.is_unmatched_xml():
            print("W: unmatched XML tag at {}".format(self.linenum))
            print("    msgid_tags  = {}".format(self.xml_msgid_tags))
            print("    msgstr_tags = {}".format(self.xml_msgstr_tags))
            print('    msgid  "{}"'.format(polib.escape(self.msgid)))
            print('    msgstr "{}"'.format(polib.escape(self.msgstr)))
            print()
#######################################################################
# main: parse the command line and check the PO file
#######################################################################
def main():
# Command-line entry point: parse the options, load args.pofile with polib,
# run XMLPOEntry over its entries, print the findings and exit.
# NOTE(review): this listing appears to interleave the removed and the added
# lines of a commit diff (two `help=` keywords for "-f", a sys.exit() that is
# followed by more code, a line-based scanner next to a polib-based loop), and
# the indentation is not shown -- confirm against the real bin/xmlpo before
# relying on any single line.
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
parser = argparse.ArgumentParser(
description="""\
xml tag checker for po-file
When PO file is generated from DocBook XML or similar file, it will contain
some XML markers. Many translation errors come from typos around such markers.
This checker will find unmatched set of XML markers between msgid a msgstr.
Return 0, if no error. Return count of errors, if the error is found.
analyzer for po-file
copyright 2024 Osamu Aoki <osamu@debian.org>
license: MIT
"""
)
parser.add_argument("-v", "--verbose", action="count", default=1, help="verbose")
parser.add_argument(
"-m",
"--msg",
action="store_true",
default=False,
help="print msgid and msgstr for each error",
)
parser.add_argument(
"-i",
"--ignore-entity",
action="store_true",
default=False,
help="ignore entity by replacing & with _",
)
# NOTE(review): as listed, this call has two long option names and repeats the
# `help=` keyword; a repeated keyword argument is a syntax error, so one of
# each pair presumably belongs to the removed side of the diff -- confirm.
parser.add_argument(
"-f",
"--test-fuzzy",
"--include-fuzzy",
action="store_true",
default=False,
help="test applies to fuzzy msg too",
help="force to include fuzzy PO entries",
)
parser.add_argument(
"-o",
"--include-obsolete",
action="store_true",
default=False,
help="force to include obsolete PO entries",
)
parser.add_argument(
"-t",
"--ignore-skip-tag-match",
action="store_true",
default=False,
help="force to ignore skip-tag-match in translator comment",
)
parser.add_argument(
"-e",
"--expose-entity",
action="store_true",
default=False,
help="force to disable masking of '&' by '_' (expose entity)",
)
parser.add_argument(
"-n",
"--expose-namespace",
action="store_true",
default=False,
help="force to disable masking of 'xl:href=' by 'xl_href=' (expose namespace)",
)
parser.add_argument("pofile", help="po file to be analyzed")
#######################################################################
# generate argument parser instance
#######################################################################
args = parser.parse_args()
# verbose = args.verbose
#######################################################################
# State of the line-based PO scanner below: which keyword the current line
# belongs to, the text accumulated so far, and the fuzzy flag of the entry.
state = "" # "msgid"/"msgstr"/""
state_last = ""
msgid_str = ""
msgstr_str = ""
fuzzy = False
msgstr_lnum = 0
print("I: Process pofile='{}'".format(args.pofile))
print()
# Load the PO file with polib; any failure is reported and ends the run with
# exit status 1.
try:
po = polib.pofile(args.pofile)
except Exception as err:
print("{} error: {} for PO file='{}'".format(type(err), err, args.pofile))
sys.exit(1)
error_count = 0
# Line-based scanner: collects the msgid/msgstr text from the raw lines and
# checks an entry once a line that is neither keyword, continuation string nor
# comment follows a msgstr.
with open(args.pofile, "r") as fp:
for lnum, line in enumerate(fp.readlines()):
line = line.strip() # remove NL
if line.startswith("msgid"):
state = "msgid"
# [1:-1] drops the first and last character of the remainder (the quotes
# around the PO string)
msgid_str = line[len("msgid ") :].strip()[1:-1]
elif line.startswith("msgstr"):
state = "msgstr"
msgstr_lnum = lnum
msgstr_str = line[len("msgstr ") :].strip()[1:-1]
elif line.startswith('"'):
# continuation line: append to whichever message is being collected
if state == "msgid":
msgid_str += line[1:-1]
elif state == "msgstr":
msgstr_str += line[1:-1]
else:
# line number should start at 1 like editor
print("E: **INVALID** PO file line={}: '{}'".format(lnum + 1, line))
sys.exit(2)
elif line.startswith("#") and "fuzzy" in line:
state = "#"
fuzzy = True
elif line.startswith("#"):
state = "#"
else:
state = ""
if state == "" and state_last == "msgstr":
# remember the fuzzy flag of the finished entry and reset it for the next one
fuzzy_in = fuzzy
fuzzy = False
# ready to report
# print("I: ----------------------------------------------------------")
if msgid_str == "" or msgstr_str == "" or "<" not in msgid_str:
# notworth analyzing
continue
if not args.test_fuzzy and fuzzy_in:
# test_fuzzy=*, fuzzy_in=False -> test
# test_fuzzy=True, fuzzy_in=True -> test
# test_fuzzy=False, fuzzy_in=True -> don't test
continue
# normalize
msgid_str = msgid_str.replace("xl:href", "href").replace('\\"', '"')
msgstr_str = msgstr_str.replace("xl:href", "href").replace('\\"', '"')
if args.ignore_entity:
msgid_str = msgid_str.replace("&", "_")
msgstr_str = msgstr_str.replace("&", "_")
# msgstr is not "" and msgid may have XML tag
xml_msgid = ET.fromstring("<msg></msg>")
xml_msgstr = ET.fromstring("<msg></msg>")
err0_str = ""
try:
xml_msgid = ET.fromstring("<msg>" + msgid_str + "</msg>")
except ET.ParseError as err0:
valid_msgid = False
# look for error position
col0 = max(err0.position[1] - len("<msg>"), 0)
err0_str = msgid_str[col0 : col0 + 20]
except Exception as err0:
valid_msgid = False
print(f"err0 unexpected {err0=}, {type(err0)=}")
else:
valid_msgid = True
err1_str = ""
try:
xml_msgstr = ET.fromstring("<msg>" + msgstr_str + "</msg>")
except ET.ParseError as err1:
valid_msgstr = False
# look for error position
col1 = max(err1.position[1] - len("<msg>"), 0)
err1_str = msgstr_str[col1 : col1 + 20]
except Exception as _:
valid_msgstr = False
else:
valid_msgstr = True
if valid_msgid and valid_msgstr:
# both messages parsed: compare the tag counts, ignoring the <msg> wrapper
tags_msgid = collections.Counter(
[element.tag for element in xml_msgid.iter()]
)
del tags_msgid["msg"]
tags_msgstr = collections.Counter(
[element.tag for element in xml_msgstr.iter()]
)
del tags_msgstr["msg"]
if tags_msgid == tags_msgstr:
# print("I: line={} valid XML and matched XML tags msgid={}".format(msgstr_lnum, tags_msgid))
pass
else:
# line number should start at 1 like editor
print(
"E: line={} **UNMATCHED XML TAG: fuzzy={} tags_msgid={} tags_msgstr={}".format(
msgstr_lnum + 1, fuzzy_in, tags_msgid, tags_msgstr
)
)
if args.msg:
print(" msgid = '{}'".format(msgid_str))
print(" msgstr = '{}'".format(msgstr_str))
error_count += 1
else:
# line number should start at 1 like editor
print(
"E: line={} **INVALID** XML: fuzzy={} error at msgid='{}' msgstr='{}' (truncated)".format(
msgstr_lnum + 1, fuzzy_in, err0_str, err1_str
)
)
if args.msg:
print(" msgid = '{}'".format(msgid_str))
print(" msgstr = '{}'".format(msgstr_str))
error_count += 1
state_last = state
print("ERROR COUNT = {}".format(error_count))
# NOTE(review): if this exit sits at function level, nothing below it can run;
# presumably it belongs to the removed side of the diff -- confirm.
sys.exit(error_count)
warn_count = 0
# polib-based check: fuzzy and obsolete entries are skipped unless the
# matching --include-* option was given.
for entry in po:
if entry.fuzzy and not args.include_fuzzy:
continue
if entry.obsolete and not args.include_obsolete:
continue
# NOTE(review): XMLPOEntry.__init__ reads "skip_fuzzy"/"skip_obsolete" from
# its keyword arguments, not "fuzzy"/"obsolete", so those two keywords look
# unused by the class -- confirm.
xmlentry = XMLPOEntry(
linenum=entry.linenum,
tcomment=entry.tcomment,
fuzzy=entry.fuzzy,
obsolete=entry.obsolete,
msgid=entry.msgid,
msgstr=entry.msgstr,
mask_entity=not args.expose_entity,
mask_namespace=not args.expose_namespace,
check_skip_tag_match=not args.ignore_skip_tag_match,
)
xmlentry.print_error()
xmlentry.print_unmatched_tags()
# an entry with invalid XML in either message counts as one error
if (
xmlentry.xml_msgid_err is not None
or xmlentry.xml_msgstr_err is not None
):
error_count += 1
if xmlentry.is_unmatched_xml() is True:
warn_count += 1
print("E: XML markup error counts = {}".format(error_count))
print("W: XML unmatched tag counts = {}".format(warn_count))
# The exit status is the total number of findings.
# NOTE(review): POSIX keeps only the low 8 bits of an exit status, so a total
# that is a multiple of 256 would read as success -- confirm this is acceptable.
sys.exit(error_count + warn_count)
#######################################################################