/* $NetBSD: utils.c,v 1.2 2025/02/25 22:11:36 christos Exp $ */ /* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #ifndef lint __RCSID("$NetBSD: utils.c,v 1.2 2025/02/25 22:11:36 christos Exp $"); #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "defs.h" #include "utils.h" #define UCS2_REPLACEMENT_CHARACTER 0xfffd /************************************************************************ * Character encoding conversion routnes * * UEFI uses UCS-2 character encoding. Note that this is not UTF-16 * as it doesn't interpret surrogate pairs. 
 *
 * From the Unicode FAQ "UTF-8, UTF-16, UTF-32 & BOM"
 * <https://www.unicode.org/faq/utf_bom.html>:
 *
 * "UCS-2 is obsolete terminology which refers to a Unicode
 * implementation up to Unicode 1.1, before surrogate code points and
 * UTF-16 were added to Version 2.0 of the standard. This term should
 * now be avoided.
 *
 * UCS-2 does not describe a data format distinct from UTF-16, because
 * both use exactly the same 16-bit code unit
 * representations. However, UCS-2 does not interpret surrogate code
 * points, and thus cannot be used to conformantly represent
 * supplementary characters.
 *
 * Sometimes in the past an implementation has been labeled "UCS-2" to
 * indicate that it does not support supplementary characters and
 * doesn't interpret pairs of surrogate code points as
 * characters. Such an implementation would not handle processing of
 * character properties, code point boundaries, collation, etc. for
 * supplementary characters, nor would it be able to support most
 * emoji, for example. [AF]"
 *
 * Regarding illegal UTF-8 sequences, the same document says:
 *
 * "None of the UTFs can generate every arbitrary byte sequence. For
 * example, in UTF-8 every byte of the form 110xxxxx_2 must be
 * followed with a byte of the form 10xxxxxx_2. A sequence such as
 * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
 * generated. When faced with this illegal byte sequence while
 * transforming or interpreting, a UTF-8 conformant process must treat
 * the first byte 110xxxxx_2 as an illegal termination error: for
 * example, either signaling an error, filtering the byte out, or
 * representing the byte with a marker such as U+FFFD REPLACEMENT
 * CHARACTER. In the latter two cases, it will continue processing at
 * the second byte 0xxxxxxx_2.
 *
 * A conformant process must not interpret illegal or ill-formed byte
 * sequences as characters, however, it may take error recovery
 * actions. No conformant process may use irregular byte sequences to
 * encode out-of-band information."
*/ /* * ibuf = input buffer (uint16_t *) * isz = bytes in input buffer * obuf = output buffer (char *) * osz = bytes in output buffer (size_t *) * * if (obuf == NULL), malloc obuf. * if (obuf != NULL), write to existing output buffer. * * Return resulting utf8 string. */ PUBLIC char * ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz) { uint8_t *dst; size_t dsz, i, j, j_max, n; uint16_t c; assert(isz > 0); if (obuf != NULL) { assert(osz != NULL); dsz = *osz; dst = (uint8_t *)obuf; } else { dsz = isz * sizeof(*dst); dst = malloc(dsz); } /* * Each UCS2 character will encode as at most 3 UTF8 bytes. * 'isz' is the number of bytes in the source buffer which * may well be larger than the UCS2 string. 'osz' is the * actual number of bytes in the NUL terminated USC2 string. */ n = isz / sizeof(*ibuf); /* max number of characters in input buffer */ j = 0; j_max = dsz / sizeof(*dst); for (i = 0; i < n; i++) { c = le16toh(ibuf[i]); if (c == 0) { break; } if (c < 0x0080) { if (j + 1 >= j_max) break; dst[j++] = (uint8_t)c; } else if (c < 0x0800) { if (j + 2 >= j_max) break; dst[j++] = 0xc0 | (uint8_t)(c >> 6); dst[j++] = 0x80 | (uint8_t)(c & 0x3f); } else { if (j + 3 >= j_max) break; dst[j++] = 0xe0 | (uint8_t)(c >> 12); dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f); dst[j++] = 0x80 | (uint8_t)(c & 0x3f); } } if (dst != NULL) dst[j] = '\0'; if (osz) *osz = j; return (char *)dst; } /* * ibuf = input buffer (char *) * isz = bytes in input buffer * obuf = output buffer (uint16_t *) * osz = bytes in output buffer (size_t *) * * if (obuf == NULL), malloc obuf. * if (obuf != NULL), write to existing output buffer. * * Return resulting ucs2 string. 
*/ PUBLIC uint16_t * utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz) { const uint8_t *src = (const uint8_t *)ibuf; uint16_t *dst; uint16_t out; size_t dsz, i, j, j_max; if (obuf != NULL) { assert(osz != NULL); dst = obuf; dsz = *osz; } else { dsz = isz * sizeof(*dst); dst = malloc(dsz); } j = 0; j_max = dsz / sizeof(*dst); for (i = 0; i < isz; i++) { out = src[i]; if (out == '\0') { break; } else if (j + 1 >= j_max) { break; } else if ((out & 0x80) == 0) { /* we're good to go */ } else if ((out & 0xe0) == 0xc0) { if (i + 1 >= isz) { /* insufficient source */ break; } if ((src[i + 1] & 0xc0) != 0x80) { out = UCS2_REPLACEMENT_CHARACTER; } else { out &= 0x1f; out <<= 6; out |= src[++i] & 0x3f; } } else if ((out & 0xf0) == 0xe0) { if (i + 2 >= isz) { /* insufficient source */ break; } if ((src[i + 1] & 0xc0) != 0x80 || (src[i + 2] & 0xc0) != 0x80) { out = UCS2_REPLACEMENT_CHARACTER; } else { out &= 0x0f; out <<= 6; out |= src[++i] & 0x3f; out <<= 6; out |= src[++i] & 0x3f; } } else { /* cannot encode as USC2 */ out = UCS2_REPLACEMENT_CHARACTER; } dst[j++] = htole16(out); } dst[j] = '\0'; if (src[i] != '\0') warnx("bad UTF8 string"); if (osz) *osz = (j + 1) * sizeof(*dst); return dst; } PUBLIC size_t utf8_to_ucs2_size(const char *src) { #if 0 uint16_t *dst; size_t sz; dst = utf8_to_ucs2(src, strlen(src) + 1, NULL, &sz); free(dst); return sz; #else const uint8_t *buf = (const uint8_t *)src; uint out; size_t i, j; j = 0; for (i = 0; (out = buf[i]) != '\0'; i++) { if ((out & 0x80) == 0) { /* we're good to go */ } else if ((out & 0xe0) == 0xc0) { if ((buf[i + 1] & 0xc0) == 0x80) { i++; } } else if ((out & 0xf0) == 0xe0) { if ((buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) { i += 2; } } j++; } return (j + 1) * sizeof(uint16_t); #endif } /************************************************************************ * UUID routines */ PUBLIC int uuid_scanf(struct uuid *uuid, const char *str) { return sscanf(str, 
"%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx", &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version, &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low, &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3], &uuid->node[4], &uuid->node[5]); } /* * from sys/kern/kern_uuid.c */ PUBLIC int uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid) { return snprintf(buf, sz, "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", uuid->time_low, uuid->time_mid, uuid->time_hi_and_version, uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low, uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3], uuid->node[4], uuid->node[5]); } PUBLIC int uuid_printf(const struct uuid *uuid) { char buf[UUID_STR_LEN]; (void) uuid_snprintf(buf, sizeof(buf), uuid); printf("%s", buf); return (0); } /************************************************************************ * Misc routines */ PUBLIC void show_data(const uint8_t *buf, size_t len, const char *prefix) { uint8_t line_buf[17]; size_t i; line_buf[16] = '\0'; for (i = 0; i < len; i++) { if ((i & 0xf) == 0) printf("%s%08zx: ", prefix, i); line_buf[i & 0xf] = isprint((int)buf[i]) ? 
buf[i] : '.'; printf("%02x ", buf[i]); if ((i & 0xf) == 0xf) printf(" %s\n", line_buf); else if ((i & 0x7) == 0x7) printf(" "); } i &= 0xf; if (i != 0) { line_buf[i] = '\0'; if (i < 8) printf(" "); while (i++ < 16) printf(" "); printf(" %s\n", line_buf); } } PUBLIC uint16_t strtous(const char *str, char **endptr, int base) { uintmax_t val; int rstatus; val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus); switch (rstatus) { case EINVAL: assert(0); break; case ENOTSUP: if (endptr != NULL) break; /*FALLTHROUGH*/ case ECANCELED: err(EXIT_FAILURE, "invalid numeric string: %s\n", str); case ERANGE: err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n", USHRT_MAX, str); default: break; } return (uint16_t)val; } char * read_file(const char *fname, size_t *size) { char *buf, *cp, *ep; size_t bufsz, cnt, sz; ssize_t ssz; int fd, fd_flags; assert(fname != NULL); if (fname == NULL) return 0; if (strcmp(fname, "-") == 0) { fd = STDIN_FILENO; if ((fd_flags = fcntl(fd, F_GETFL)) == -1) err(EXIT_FAILURE, "fcntl F_GETFL"); if (fcntl(fd, F_SETFL, O_NONBLOCK | fd_flags) == -1) err(EXIT_FAILURE, "fcntl F_SETFL"); } else { fd_flags = -1; fd = open(fname, O_RDONLY); if (fd == -1) err(EXIT_FAILURE, "open"); } bufsz = 0x800; buf = emalloc(bufsz); cp = buf; ep = buf + bufsz; cnt = 0; for (;;) { ssz = read(fd, cp, (size_t)(ep - cp)); if (ssz == -1) { if (errno == EAGAIN) continue; err(EXIT_FAILURE, "read"); } assert(ssz >= 0); #if 0 printf("ssz: %zd\n", ssz); show_data((uint8_t *)cp, (size_t)ssz, ""); #endif if (ssz == 0) break; cp += ssz; sz = (size_t)ssz; cnt += sz; if (cp < ep) { /* XXX: what about UCS-2? */ *cp = '\0'; cnt++; break; } if (cp == ep) { bufsz *= 2; buf = erealloc(buf, bufsz); cp = buf + cnt; ep = buf + bufsz; } } if (fd_flags != -1) fcntl(fd, F_SETFL, fd_flags); else close(fd); *size = cnt; return buf; }