1*820c1a8dSHiroo HAYASHI /* Convert multibyte character to wide character. 2*820c1a8dSHiroo HAYASHI Copyright (C) 1999-2002, 2005-2021 Free Software Foundation, Inc. 3*820c1a8dSHiroo HAYASHI 4*820c1a8dSHiroo HAYASHI This file is free software: you can redistribute it and/or modify 5*820c1a8dSHiroo HAYASHI it under the terms of the GNU Lesser General Public License as 6*820c1a8dSHiroo HAYASHI published by the Free Software Foundation; either version 2.1 of the 7*820c1a8dSHiroo HAYASHI License, or (at your option) any later version. 8*820c1a8dSHiroo HAYASHI 9*820c1a8dSHiroo HAYASHI This file is distributed in the hope that it will be useful, 10*820c1a8dSHiroo HAYASHI but WITHOUT ANY WARRANTY; without even the implied warranty of 11*820c1a8dSHiroo HAYASHI MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12*820c1a8dSHiroo HAYASHI GNU Lesser General Public License for more details. 13*820c1a8dSHiroo HAYASHI 14*820c1a8dSHiroo HAYASHI You should have received a copy of the GNU Lesser General Public License 15*820c1a8dSHiroo HAYASHI along with this program. If not, see <https://www.gnu.org/licenses/>. */ 16*820c1a8dSHiroo HAYASHI 17*820c1a8dSHiroo HAYASHI /* Written by Bruno Haible <bruno@clisp.org>, 2008. */ 18*820c1a8dSHiroo HAYASHI 19*820c1a8dSHiroo HAYASHI /* This file contains the body of the mbrtowc and mbrtoc32 functions, 20*820c1a8dSHiroo HAYASHI when GNULIB_defined_mbstate_t is defined. */ 21*820c1a8dSHiroo HAYASHI 22*820c1a8dSHiroo HAYASHI char *pstate = (char *)ps; 23*820c1a8dSHiroo HAYASHI 24*820c1a8dSHiroo HAYASHI if (s == NULL) 25*820c1a8dSHiroo HAYASHI { 26*820c1a8dSHiroo HAYASHI pwc = NULL; 27*820c1a8dSHiroo HAYASHI s = ""; 28*820c1a8dSHiroo HAYASHI n = 1; 29*820c1a8dSHiroo HAYASHI } 30*820c1a8dSHiroo HAYASHI 31*820c1a8dSHiroo HAYASHI if (n == 0) 32*820c1a8dSHiroo HAYASHI return (size_t)(-2); 33*820c1a8dSHiroo HAYASHI 34*820c1a8dSHiroo HAYASHI /* Here n > 0. */ 35*820c1a8dSHiroo HAYASHI 36*820c1a8dSHiroo HAYASHI if (pstate == NULL) 37*820c1a8dSHiroo HAYASHI pstate = internal_state; 38*820c1a8dSHiroo HAYASHI 39*820c1a8dSHiroo HAYASHI { 40*820c1a8dSHiroo HAYASHI size_t nstate = pstate[0]; 41*820c1a8dSHiroo HAYASHI char buf[4]; 42*820c1a8dSHiroo HAYASHI const char *p; 43*820c1a8dSHiroo HAYASHI size_t m; 44*820c1a8dSHiroo HAYASHI enc_t enc; 45*820c1a8dSHiroo HAYASHI int res; 46*820c1a8dSHiroo HAYASHI 47*820c1a8dSHiroo HAYASHI switch (nstate) 48*820c1a8dSHiroo HAYASHI { 49*820c1a8dSHiroo HAYASHI case 0: 50*820c1a8dSHiroo HAYASHI p = s; 51*820c1a8dSHiroo HAYASHI m = n; 52*820c1a8dSHiroo HAYASHI break; 53*820c1a8dSHiroo HAYASHI case 3: 54*820c1a8dSHiroo HAYASHI buf[2] = pstate[3]; 55*820c1a8dSHiroo HAYASHI FALLTHROUGH; 56*820c1a8dSHiroo HAYASHI case 2: 57*820c1a8dSHiroo HAYASHI buf[1] = pstate[2]; 58*820c1a8dSHiroo HAYASHI FALLTHROUGH; 59*820c1a8dSHiroo HAYASHI case 1: 60*820c1a8dSHiroo HAYASHI buf[0] = pstate[1]; 61*820c1a8dSHiroo HAYASHI p = buf; 62*820c1a8dSHiroo HAYASHI m = nstate; 63*820c1a8dSHiroo HAYASHI buf[m++] = s[0]; 64*820c1a8dSHiroo HAYASHI if (n >= 2 && m < 4) 65*820c1a8dSHiroo HAYASHI { 66*820c1a8dSHiroo HAYASHI buf[m++] = s[1]; 67*820c1a8dSHiroo HAYASHI if (n >= 3 && m < 4) 68*820c1a8dSHiroo HAYASHI buf[m++] = s[2]; 69*820c1a8dSHiroo HAYASHI } 70*820c1a8dSHiroo HAYASHI break; 71*820c1a8dSHiroo HAYASHI default: 72*820c1a8dSHiroo HAYASHI errno = EINVAL; 73*820c1a8dSHiroo HAYASHI return (size_t)(-1); 74*820c1a8dSHiroo HAYASHI } 75*820c1a8dSHiroo HAYASHI 76*820c1a8dSHiroo HAYASHI /* Here m > 0. */ 77*820c1a8dSHiroo HAYASHI 78*820c1a8dSHiroo HAYASHI enc = locale_encoding_classification (); 79*820c1a8dSHiroo HAYASHI 80*820c1a8dSHiroo HAYASHI if (enc == enc_utf8) /* UTF-8 */ 81*820c1a8dSHiroo HAYASHI { 82*820c1a8dSHiroo HAYASHI /* Achieve 83*820c1a8dSHiroo HAYASHI - multi-thread safety and 84*820c1a8dSHiroo HAYASHI - the ability to produce wide character values > WCHAR_MAX 85*820c1a8dSHiroo HAYASHI by not calling mbtowc() at all. */ 86*820c1a8dSHiroo HAYASHI #include "mbrtowc-impl-utf8.h" 87*820c1a8dSHiroo HAYASHI } 88*820c1a8dSHiroo HAYASHI else 89*820c1a8dSHiroo HAYASHI { 90*820c1a8dSHiroo HAYASHI /* The hidden internal state of mbtowc would make this function not 91*820c1a8dSHiroo HAYASHI multi-thread safe. Achieve multi-thread safety through a lock. */ 92*820c1a8dSHiroo HAYASHI wchar_t wc; 93*820c1a8dSHiroo HAYASHI res = mbtowc_with_lock (&wc, p, m); 94*820c1a8dSHiroo HAYASHI 95*820c1a8dSHiroo HAYASHI if (res >= 0) 96*820c1a8dSHiroo HAYASHI { 97*820c1a8dSHiroo HAYASHI if ((wc == 0) != (res == 0)) 98*820c1a8dSHiroo HAYASHI abort (); 99*820c1a8dSHiroo HAYASHI if (pwc != NULL) 100*820c1a8dSHiroo HAYASHI *pwc = wc; 101*820c1a8dSHiroo HAYASHI goto success; 102*820c1a8dSHiroo HAYASHI } 103*820c1a8dSHiroo HAYASHI 104*820c1a8dSHiroo HAYASHI /* mbtowc does not distinguish between invalid and incomplete multibyte 105*820c1a8dSHiroo HAYASHI sequences. But mbrtowc needs to make this distinction. 106*820c1a8dSHiroo HAYASHI There are two possible approaches: 107*820c1a8dSHiroo HAYASHI - Use iconv() and its return value. 108*820c1a8dSHiroo HAYASHI - Use built-in knowledge about the possible encodings. 109*820c1a8dSHiroo HAYASHI Given the low quality of implementation of iconv() on the systems 110*820c1a8dSHiroo HAYASHI that lack mbrtowc(), we use the second approach. 111*820c1a8dSHiroo HAYASHI The possible encodings are: 112*820c1a8dSHiroo HAYASHI - 8-bit encodings, 113*820c1a8dSHiroo HAYASHI - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, 114*820c1a8dSHiroo HAYASHI - UTF-8 (already handled above). 115*820c1a8dSHiroo HAYASHI Use specialized code for each. */ 116*820c1a8dSHiroo HAYASHI if (m >= 4 || m >= MB_CUR_MAX) 117*820c1a8dSHiroo HAYASHI goto invalid; 118*820c1a8dSHiroo HAYASHI /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ 119*820c1a8dSHiroo HAYASHI switch (enc) 120*820c1a8dSHiroo HAYASHI { 121*820c1a8dSHiroo HAYASHI /* As a reference for this code, you can use the GNU libiconv 122*820c1a8dSHiroo HAYASHI implementation. Look for uses of the RET_TOOFEW macro. */ 123*820c1a8dSHiroo HAYASHI 124*820c1a8dSHiroo HAYASHI case enc_eucjp: /* EUC-JP */ 125*820c1a8dSHiroo HAYASHI { 126*820c1a8dSHiroo HAYASHI if (m == 1) 127*820c1a8dSHiroo HAYASHI { 128*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 129*820c1a8dSHiroo HAYASHI 130*820c1a8dSHiroo HAYASHI if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) 131*820c1a8dSHiroo HAYASHI goto incomplete; 132*820c1a8dSHiroo HAYASHI } 133*820c1a8dSHiroo HAYASHI if (m == 2) 134*820c1a8dSHiroo HAYASHI { 135*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 136*820c1a8dSHiroo HAYASHI 137*820c1a8dSHiroo HAYASHI if (c == 0x8f) 138*820c1a8dSHiroo HAYASHI { 139*820c1a8dSHiroo HAYASHI unsigned char c2 = (unsigned char) p[1]; 140*820c1a8dSHiroo HAYASHI 141*820c1a8dSHiroo HAYASHI if (c2 >= 0xa1 && c2 < 0xff) 142*820c1a8dSHiroo HAYASHI goto incomplete; 143*820c1a8dSHiroo HAYASHI } 144*820c1a8dSHiroo HAYASHI } 145*820c1a8dSHiroo HAYASHI goto invalid; 146*820c1a8dSHiroo HAYASHI } 147*820c1a8dSHiroo HAYASHI 148*820c1a8dSHiroo HAYASHI case enc_94: /* EUC-KR, GB2312, BIG5 */ 149*820c1a8dSHiroo HAYASHI { 150*820c1a8dSHiroo HAYASHI if (m == 1) 151*820c1a8dSHiroo HAYASHI { 152*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 153*820c1a8dSHiroo HAYASHI 154*820c1a8dSHiroo HAYASHI if (c >= 0xa1 && c < 0xff) 155*820c1a8dSHiroo HAYASHI goto incomplete; 156*820c1a8dSHiroo HAYASHI } 157*820c1a8dSHiroo HAYASHI goto invalid; 158*820c1a8dSHiroo HAYASHI } 159*820c1a8dSHiroo HAYASHI 160*820c1a8dSHiroo HAYASHI case enc_euctw: /* EUC-TW */ 161*820c1a8dSHiroo HAYASHI { 162*820c1a8dSHiroo HAYASHI if (m == 1) 163*820c1a8dSHiroo HAYASHI { 164*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 165*820c1a8dSHiroo HAYASHI 166*820c1a8dSHiroo HAYASHI if ((c >= 0xa1 && c < 0xff) || c == 0x8e) 167*820c1a8dSHiroo HAYASHI goto incomplete; 168*820c1a8dSHiroo HAYASHI } 169*820c1a8dSHiroo HAYASHI else /* m == 2 || m == 3 */ 170*820c1a8dSHiroo HAYASHI { 171*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 172*820c1a8dSHiroo HAYASHI 173*820c1a8dSHiroo HAYASHI if (c == 0x8e) 174*820c1a8dSHiroo HAYASHI goto incomplete; 175*820c1a8dSHiroo HAYASHI } 176*820c1a8dSHiroo HAYASHI goto invalid; 177*820c1a8dSHiroo HAYASHI } 178*820c1a8dSHiroo HAYASHI 179*820c1a8dSHiroo HAYASHI case enc_gb18030: /* GB18030 */ 180*820c1a8dSHiroo HAYASHI { 181*820c1a8dSHiroo HAYASHI if (m == 1) 182*820c1a8dSHiroo HAYASHI { 183*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 184*820c1a8dSHiroo HAYASHI 185*820c1a8dSHiroo HAYASHI if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) 186*820c1a8dSHiroo HAYASHI goto incomplete; 187*820c1a8dSHiroo HAYASHI } 188*820c1a8dSHiroo HAYASHI else /* m == 2 || m == 3 */ 189*820c1a8dSHiroo HAYASHI { 190*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 191*820c1a8dSHiroo HAYASHI 192*820c1a8dSHiroo HAYASHI if (c >= 0x90 && c <= 0xe3) 193*820c1a8dSHiroo HAYASHI { 194*820c1a8dSHiroo HAYASHI unsigned char c2 = (unsigned char) p[1]; 195*820c1a8dSHiroo HAYASHI 196*820c1a8dSHiroo HAYASHI if (c2 >= 0x30 && c2 <= 0x39) 197*820c1a8dSHiroo HAYASHI { 198*820c1a8dSHiroo HAYASHI if (m == 2) 199*820c1a8dSHiroo HAYASHI goto incomplete; 200*820c1a8dSHiroo HAYASHI else /* m == 3 */ 201*820c1a8dSHiroo HAYASHI { 202*820c1a8dSHiroo HAYASHI unsigned char c3 = (unsigned char) p[2]; 203*820c1a8dSHiroo HAYASHI 204*820c1a8dSHiroo HAYASHI if (c3 >= 0x81 && c3 <= 0xfe) 205*820c1a8dSHiroo HAYASHI goto incomplete; 206*820c1a8dSHiroo HAYASHI } 207*820c1a8dSHiroo HAYASHI } 208*820c1a8dSHiroo HAYASHI } 209*820c1a8dSHiroo HAYASHI } 210*820c1a8dSHiroo HAYASHI goto invalid; 211*820c1a8dSHiroo HAYASHI } 212*820c1a8dSHiroo HAYASHI 213*820c1a8dSHiroo HAYASHI case enc_sjis: /* SJIS */ 214*820c1a8dSHiroo HAYASHI { 215*820c1a8dSHiroo HAYASHI if (m == 1) 216*820c1a8dSHiroo HAYASHI { 217*820c1a8dSHiroo HAYASHI unsigned char c = (unsigned char) p[0]; 218*820c1a8dSHiroo HAYASHI 219*820c1a8dSHiroo HAYASHI if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) 220*820c1a8dSHiroo HAYASHI || (c >= 0xf0 && c <= 0xf9)) 221*820c1a8dSHiroo HAYASHI goto incomplete; 222*820c1a8dSHiroo HAYASHI } 223*820c1a8dSHiroo HAYASHI goto invalid; 224*820c1a8dSHiroo HAYASHI } 225*820c1a8dSHiroo HAYASHI 226*820c1a8dSHiroo HAYASHI default: 227*820c1a8dSHiroo HAYASHI /* An unknown multibyte encoding. */ 228*820c1a8dSHiroo HAYASHI goto incomplete; 229*820c1a8dSHiroo HAYASHI } 230*820c1a8dSHiroo HAYASHI } 231*820c1a8dSHiroo HAYASHI 232*820c1a8dSHiroo HAYASHI success: 233*820c1a8dSHiroo HAYASHI /* res >= 0 is the corrected return value of 234*820c1a8dSHiroo HAYASHI mbtowc_with_lock (&wc, p, m). */ 235*820c1a8dSHiroo HAYASHI if (nstate >= (res > 0 ? res : 1)) 236*820c1a8dSHiroo HAYASHI abort (); 237*820c1a8dSHiroo HAYASHI res -= nstate; 238*820c1a8dSHiroo HAYASHI pstate[0] = 0; 239*820c1a8dSHiroo HAYASHI return res; 240*820c1a8dSHiroo HAYASHI 241*820c1a8dSHiroo HAYASHI incomplete: 242*820c1a8dSHiroo HAYASHI { 243*820c1a8dSHiroo HAYASHI size_t k = nstate; 244*820c1a8dSHiroo HAYASHI /* Here 0 <= k < m < 4. */ 245*820c1a8dSHiroo HAYASHI pstate[++k] = s[0]; 246*820c1a8dSHiroo HAYASHI if (k < m) 247*820c1a8dSHiroo HAYASHI { 248*820c1a8dSHiroo HAYASHI pstate[++k] = s[1]; 249*820c1a8dSHiroo HAYASHI if (k < m) 250*820c1a8dSHiroo HAYASHI pstate[++k] = s[2]; 251*820c1a8dSHiroo HAYASHI } 252*820c1a8dSHiroo HAYASHI if (k != m) 253*820c1a8dSHiroo HAYASHI abort (); 254*820c1a8dSHiroo HAYASHI } 255*820c1a8dSHiroo HAYASHI pstate[0] = m; 256*820c1a8dSHiroo HAYASHI return (size_t)(-2); 257*820c1a8dSHiroo HAYASHI 258*820c1a8dSHiroo HAYASHI invalid: 259*820c1a8dSHiroo HAYASHI errno = EILSEQ; 260*820c1a8dSHiroo HAYASHI /* The conversion state is undefined, says POSIX. */ 261*820c1a8dSHiroo HAYASHI return (size_t)(-1); 262*820c1a8dSHiroo HAYASHI } 263