freebsd-src/usr.bin/localedef/charmap.c
Mark Johnston 770fba248d localedef: Use consistent sorting order
In several modules we build up an RB tree keyed by wide characters.
wchar_t has different signedness on different platforms, so iteration
over such a tree results in platform-dependent ordering.

The ctype module uses this ordering when writing the output file, which
creates reproducibility problems when comparing the results of cross
builds and native builds (e.g., native amd64 vs. cross-building on
arm64).

Modify such comparisons to always be unsigned.  Introduce a helper
function for this purpose.  In the other modules I believe the sort
order does not affect program output.

On systems with signed wchar_t, of the files in /usr/share/locale this
only affects zh_CN.GB18030/LC_CTYPE.

MFC after:	2 weeks
Sponsored by:	The FreeBSD Foundation
Sponsored by:	Klara, Inc.
Differential Revision:	https://reviews.freebsd.org/D50756
2025-06-20 16:39:22 +00:00

395 lines
8.2 KiB
C

/*-
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CHARMAP file handling for localedef.
*/
#include <sys/types.h>
#include <sys/tree.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <stddef.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
typedef struct charmap {
const char *name;
wchar_t wc;
RB_ENTRY(charmap) rb_sym;
RB_ENTRY(charmap) rb_wc;
} charmap_t;
static int cmap_compare_sym(const void *n1, const void *n2);
static int cmap_compare_wc(const void *n1, const void *n2);
static RB_HEAD(cmap_sym, charmap) cmap_sym;
static RB_HEAD(cmap_wc, charmap) cmap_wc;
RB_GENERATE_STATIC(cmap_sym, charmap, rb_sym, cmap_compare_sym);
RB_GENERATE_STATIC(cmap_wc, charmap, rb_wc, cmap_compare_wc);
/*
* Array of POSIX specific portable characters.
*/
static const struct {
const char *name;
int ch;
} portable_chars[] = {
{ "NUL", '\0' },
{ "SOH", '\x01' },
{ "STX", '\x02' },
{ "ETX", '\x03' },
{ "EOT", '\x04' },
{ "ENQ", '\x05' },
{ "ACK", '\x06' },
{ "BEL", '\a' },
{ "alert", '\a' },
{ "BS", '\b' },
{ "backspace", '\b' },
{ "HT", '\t' },
{ "tab", '\t' },
{ "LF", '\n' },
{ "newline", '\n' },
{ "VT", '\v' },
{ "vertical-tab", '\v' },
{ "FF", '\f' },
{ "form-feed", '\f' },
{ "CR", '\r' },
{ "carriage-return", '\r' },
{ "SO", '\x0e' },
{ "SI", '\x0f' },
{ "DLE", '\x10' },
{ "DC1", '\x11' },
{ "DC2", '\x12' },
{ "DC3", '\x13' },
{ "DC4", '\x14' },
{ "NAK", '\x15' },
{ "SYN", '\x16' },
{ "ETB", '\x17' },
{ "CAN", '\x18' },
{ "EM", '\x19' },
{ "SUB", '\x1a' },
{ "ESC", '\x1b' },
{ "FS", '\x1c' },
{ "IS4", '\x1c' },
{ "GS", '\x1d' },
{ "IS3", '\x1d' },
{ "RS", '\x1e' },
{ "IS2", '\x1e' },
{ "US", '\x1f' },
{ "IS1", '\x1f' },
{ "DEL", '\x7f' },
{ "space", ' ' },
{ "exclamation-mark", '!' },
{ "quotation-mark", '"' },
{ "number-sign", '#' },
{ "dollar-sign", '$' },
{ "percent-sign", '%' },
{ "ampersand", '&' },
{ "apostrophe", '\'' },
{ "left-parenthesis", '(' },
{ "right-parenthesis", ')' },
{ "asterisk", '*' },
{ "plus-sign", '+' },
{ "comma", ','},
{ "hyphen-minus", '-' },
{ "hyphen", '-' },
{ "full-stop", '.' },
{ "period", '.' },
{ "slash", '/' },
{ "solidus", '/' },
{ "zero", '0' },
{ "one", '1' },
{ "two", '2' },
{ "three", '3' },
{ "four", '4' },
{ "five", '5' },
{ "six", '6' },
{ "seven", '7' },
{ "eight", '8' },
{ "nine", '9' },
{ "colon", ':' },
{ "semicolon", ';' },
{ "less-than-sign", '<' },
{ "equals-sign", '=' },
{ "greater-than-sign", '>' },
{ "question-mark", '?' },
{ "commercial-at", '@' },
{ "left-square-bracket", '[' },
{ "backslash", '\\' },
{ "reverse-solidus", '\\' },
{ "right-square-bracket", ']' },
{ "circumflex", '^' },
{ "circumflex-accent", '^' },
{ "low-line", '_' },
{ "underscore", '_' },
{ "grave-accent", '`' },
{ "left-brace", '{' },
{ "left-curly-bracket", '{' },
{ "vertical-line", '|' },
{ "right-brace", '}' },
{ "right-curly-bracket", '}' },
{ "tilde", '~' },
{ "A", 'A' },
{ "B", 'B' },
{ "C", 'C' },
{ "D", 'D' },
{ "E", 'E' },
{ "F", 'F' },
{ "G", 'G' },
{ "H", 'H' },
{ "I", 'I' },
{ "J", 'J' },
{ "K", 'K' },
{ "L", 'L' },
{ "M", 'M' },
{ "N", 'N' },
{ "O", 'O' },
{ "P", 'P' },
{ "Q", 'Q' },
{ "R", 'R' },
{ "S", 'S' },
{ "T", 'T' },
{ "U", 'U' },
{ "V", 'V' },
{ "W", 'W' },
{ "X", 'X' },
{ "Y", 'Y' },
{ "Z", 'Z' },
{ "a", 'a' },
{ "b", 'b' },
{ "c", 'c' },
{ "d", 'd' },
{ "e", 'e' },
{ "f", 'f' },
{ "g", 'g' },
{ "h", 'h' },
{ "i", 'i' },
{ "j", 'j' },
{ "k", 'k' },
{ "l", 'l' },
{ "m", 'm' },
{ "n", 'n' },
{ "o", 'o' },
{ "p", 'p' },
{ "q", 'q' },
{ "r", 'r' },
{ "s", 's' },
{ "t", 't' },
{ "u", 'u' },
{ "v", 'v' },
{ "w", 'w' },
{ "x", 'x' },
{ "y", 'y' },
{ "z", 'z' },
{ NULL, 0 }
};
static int
cmap_compare_sym(const void *n1, const void *n2)
{
const charmap_t *c1 = n1;
const charmap_t *c2 = n2;
int rv;
rv = strcmp(c1->name, c2->name);
return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
}
static int
cmap_compare_wc(const void *n1, const void *n2)
{
const charmap_t *c1 = n1;
const charmap_t *c2 = n2;
return (wchar_cmp(c1->wc, c2->wc));
}
void
init_charmap(void)
{
RB_INIT(&cmap_sym);
RB_INIT(&cmap_wc);
}
static void
add_charmap_impl(const char *sym, wchar_t wc, int nodups)
{
charmap_t srch;
charmap_t *n = NULL;
srch.wc = wc;
srch.name = sym;
/*
* also possibly insert the wide mapping, although note that there
* can only be one of these per wide character code.
*/
if ((wc != (wchar_t)-1) && ((RB_FIND(cmap_wc, &cmap_wc, &srch)) == NULL)) {
if ((n = calloc(1, sizeof (*n))) == NULL) {
errf("out of memory");
return;
}
n->wc = wc;
RB_INSERT(cmap_wc, &cmap_wc, n);
}
if (sym) {
if (RB_FIND(cmap_sym, &cmap_sym, &srch) != NULL) {
if (nodups) {
errf("duplicate character definition");
}
return;
}
if ((n == NULL) && ((n = calloc(1, sizeof (*n))) == NULL)) {
errf("out of memory");
return;
}
n->wc = wc;
n->name = sym;
RB_INSERT(cmap_sym, &cmap_sym, n);
}
}
void
add_charmap(const char *sym, int c)
{
add_charmap_impl(sym, c, 1);
}
void
add_charmap_undefined(char *sym)
{
charmap_t srch;
charmap_t *cm = NULL;
srch.name = sym;
cm = RB_FIND(cmap_sym, &cmap_sym, &srch);
if ((undefok == 0) && ((cm == NULL) || (cm->wc == (wchar_t)-1))) {
warn("undefined symbol <%s>", sym);
add_charmap_impl(sym, -1, 0);
} else {
free(sym);
}
}
void
add_charmap_range(char *s, char *e, int wc)
{
int ls, le;
int si;
int sn, en;
int i;
static const char *digits = "0123456789";
ls = strlen(s);
le = strlen(e);
if (((si = strcspn(s, digits)) == 0) || (si == ls) ||
(strncmp(s, e, si) != 0) ||
((int)strspn(s + si, digits) != (ls - si)) ||
((int)strspn(e + si, digits) != (le - si)) ||
((sn = atoi(s + si)) > ((en = atoi(e + si))))) {
errf("malformed charmap range");
return;
}
s[si] = 0;
for (i = sn; i <= en; i++) {
char *nn;
(void) asprintf(&nn, "%s%0*u", s, ls - si, i);
if (nn == NULL) {
errf("out of memory");
return;
}
add_charmap_impl(nn, wc, 1);
wc++;
}
free(s);
free(e);
}
void
add_charmap_char(const char *name, int val)
{
add_charmap_impl(name, val, 0);
}
/*
* POSIX insists that certain entries be present, even when not in the
* original charmap file.
*/
void
add_charmap_posix(void)
{
int i;
for (i = 0; portable_chars[i].name; i++) {
add_charmap_char(portable_chars[i].name, portable_chars[i].ch);
}
}
int
lookup_charmap(const char *sym, wchar_t *wc)
{
charmap_t srch;
charmap_t *n;
srch.name = sym;
n = RB_FIND(cmap_sym, &cmap_sym, &srch);
if (n && n->wc != (wchar_t)-1) {
if (wc)
*wc = n->wc;
return (0);
}
return (-1);
}
int
check_charmap(wchar_t wc)
{
charmap_t srch;
srch.wc = wc;
return (RB_FIND(cmap_wc, &cmap_wc, &srch) ? 0 : -1);
}