xref: /Universal-ctags/gnulib/mbrtowc-impl.h (revision 820c1a8d46849a90376d8eb15b319ac05439f656)
1*820c1a8dSHiroo HAYASHI /* Convert multibyte character to wide character.
2*820c1a8dSHiroo HAYASHI    Copyright (C) 1999-2002, 2005-2021 Free Software Foundation, Inc.
3*820c1a8dSHiroo HAYASHI 
4*820c1a8dSHiroo HAYASHI    This file is free software: you can redistribute it and/or modify
5*820c1a8dSHiroo HAYASHI    it under the terms of the GNU Lesser General Public License as
6*820c1a8dSHiroo HAYASHI    published by the Free Software Foundation; either version 2.1 of the
7*820c1a8dSHiroo HAYASHI    License, or (at your option) any later version.
8*820c1a8dSHiroo HAYASHI 
9*820c1a8dSHiroo HAYASHI    This file is distributed in the hope that it will be useful,
10*820c1a8dSHiroo HAYASHI    but WITHOUT ANY WARRANTY; without even the implied warranty of
11*820c1a8dSHiroo HAYASHI    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12*820c1a8dSHiroo HAYASHI    GNU Lesser General Public License for more details.
13*820c1a8dSHiroo HAYASHI 
14*820c1a8dSHiroo HAYASHI    You should have received a copy of the GNU Lesser General Public License
15*820c1a8dSHiroo HAYASHI    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16*820c1a8dSHiroo HAYASHI 
17*820c1a8dSHiroo HAYASHI /* Written by Bruno Haible <bruno@clisp.org>, 2008.  */
18*820c1a8dSHiroo HAYASHI 
19*820c1a8dSHiroo HAYASHI /* This file contains the body of the mbrtowc and mbrtoc32 functions,
20*820c1a8dSHiroo HAYASHI    when GNULIB_defined_mbstate_t is defined.  */
21*820c1a8dSHiroo HAYASHI 
/* Argument normalization for the shared mbrtowc/mbrtoc32 body.
   NOTE(review): ps, s, n, pwc and internal_state are not declared in this
   fragment; presumably they are supplied by the function that includes it --
   confirm against the including file.  */
22*820c1a8dSHiroo HAYASHI   char *pstate = (char *)ps;
                            /* The state is accessed as raw bytes: the code further down reads
                               pstate[0] as a count and pstate[1..3] as saved input bytes.  */
23*820c1a8dSHiroo HAYASHI 
                            /* A null S is handled like a call with PWC = NULL, S = "", N = 1:
                               the empty string is converted and no wide character is stored.  */
24*820c1a8dSHiroo HAYASHI   if (s == NULL)
25*820c1a8dSHiroo HAYASHI     {
26*820c1a8dSHiroo HAYASHI       pwc = NULL;
27*820c1a8dSHiroo HAYASHI       s = "";
28*820c1a8dSHiroo HAYASHI       n = 1;
29*820c1a8dSHiroo HAYASHI     }
30*820c1a8dSHiroo HAYASHI 
                            /* With no input bytes to examine, report "incomplete" (-2) right
                               away; the state is left untouched.  */
31*820c1a8dSHiroo HAYASHI   if (n == 0)
32*820c1a8dSHiroo HAYASHI     return (size_t)(-2);
33*820c1a8dSHiroo HAYASHI 
34*820c1a8dSHiroo HAYASHI   /* Here n > 0.  */
35*820c1a8dSHiroo HAYASHI 
                            /* A null PS selects internal_state instead of a caller-provided
                               state object.  */
36*820c1a8dSHiroo HAYASHI   if (pstate == NULL)
37*820c1a8dSHiroo HAYASHI     pstate = internal_state;
38*820c1a8dSHiroo HAYASHI 
/* Core of the conversion: rebuild the byte sequence to convert from the bytes
   saved in the state plus the new input, convert it, and leave through one
   of the three labels success / incomplete / invalid at the end.  */
39*820c1a8dSHiroo HAYASHI   {
                              /* nstate: number of bytes saved in the state by an earlier call
                                 that ended at the 'incomplete' label (which stores m there).  */
40*820c1a8dSHiroo HAYASHI     size_t nstate = pstate[0];
                              /* buf: the saved bytes followed by up to 3 new bytes from s,
                                 4 bytes at most in total.  */
41*820c1a8dSHiroo HAYASHI     char buf[4];
                              /* p, m: start and length of the byte sequence to convert.  */
42*820c1a8dSHiroo HAYASHI     const char *p;
43*820c1a8dSHiroo HAYASHI     size_t m;
44*820c1a8dSHiroo HAYASHI     enc_t enc;
45*820c1a8dSHiroo HAYASHI     int res;
46*820c1a8dSHiroo HAYASHI 
47*820c1a8dSHiroo HAYASHI     switch (nstate)
48*820c1a8dSHiroo HAYASHI       {
                                /* Nothing saved: convert directly from the caller's buffer.  */
49*820c1a8dSHiroo HAYASHI       case 0:
50*820c1a8dSHiroo HAYASHI         p = s;
51*820c1a8dSHiroo HAYASHI         m = n;
52*820c1a8dSHiroo HAYASHI         break;
                                /* 1 to 3 bytes saved: the cases fall through so that exactly
                                   pstate[1..nstate] is copied into buf[0..nstate-1].  */
53*820c1a8dSHiroo HAYASHI       case 3:
54*820c1a8dSHiroo HAYASHI         buf[2] = pstate[3];
55*820c1a8dSHiroo HAYASHI         FALLTHROUGH;
56*820c1a8dSHiroo HAYASHI       case 2:
57*820c1a8dSHiroo HAYASHI         buf[1] = pstate[2];
58*820c1a8dSHiroo HAYASHI         FALLTHROUGH;
59*820c1a8dSHiroo HAYASHI       case 1:
60*820c1a8dSHiroo HAYASHI         buf[0] = pstate[1];
61*820c1a8dSHiroo HAYASHI         p = buf;
62*820c1a8dSHiroo HAYASHI         m = nstate;
                                  /* Append new input: s[0] always (n > 0 here), then s[1] and
                                     s[2] while input remains and buf is not yet full.  */
63*820c1a8dSHiroo HAYASHI         buf[m++] = s[0];
64*820c1a8dSHiroo HAYASHI         if (n >= 2 && m < 4)
65*820c1a8dSHiroo HAYASHI           {
66*820c1a8dSHiroo HAYASHI             buf[m++] = s[1];
67*820c1a8dSHiroo HAYASHI             if (n >= 3 && m < 4)
68*820c1a8dSHiroo HAYASHI               buf[m++] = s[2];
69*820c1a8dSHiroo HAYASHI           }
70*820c1a8dSHiroo HAYASHI         break;
                                /* Any other count cannot have been written by this code:
                                   reject the state with EINVAL.  */
71*820c1a8dSHiroo HAYASHI       default:
72*820c1a8dSHiroo HAYASHI         errno = EINVAL;
73*820c1a8dSHiroo HAYASHI         return (size_t)(-1);
74*820c1a8dSHiroo HAYASHI       }
75*820c1a8dSHiroo HAYASHI 
76*820c1a8dSHiroo HAYASHI     /* Here m > 0.  */
77*820c1a8dSHiroo HAYASHI 
                              /* NOTE(review): locale_encoding_classification is defined
                                 elsewhere; presumably it classifies the current locale's
                                 encoding into the enc_* values used below -- confirm.  */
78*820c1a8dSHiroo HAYASHI     enc = locale_encoding_classification ();
79*820c1a8dSHiroo HAYASHI 
80*820c1a8dSHiroo HAYASHI     if (enc == enc_utf8) /* UTF-8 */
81*820c1a8dSHiroo HAYASHI       {
82*820c1a8dSHiroo HAYASHI         /* Achieve
83*820c1a8dSHiroo HAYASHI              - multi-thread safety and
84*820c1a8dSHiroo HAYASHI              - the ability to produce wide character values > WCHAR_MAX
85*820c1a8dSHiroo HAYASHI            by not calling mbtowc() at all.  */
                                  /* NOTE(review): the included fragment is not visible here.
                                     It is expanded inside this block, so it presumably relies
                                     on the local variables above and on the labels below --
                                     confirm before renaming any of them.  */
86*820c1a8dSHiroo HAYASHI #include "mbrtowc-impl-utf8.h"
87*820c1a8dSHiroo HAYASHI       }
88*820c1a8dSHiroo HAYASHI     else
89*820c1a8dSHiroo HAYASHI       {
90*820c1a8dSHiroo HAYASHI         /* The hidden internal state of mbtowc would make this function not
91*820c1a8dSHiroo HAYASHI            multi-thread safe.  Achieve multi-thread safety through a lock.  */
92*820c1a8dSHiroo HAYASHI         wchar_t wc;
93*820c1a8dSHiroo HAYASHI         res = mbtowc_with_lock (&wc, p, m);
94*820c1a8dSHiroo HAYASHI 
95*820c1a8dSHiroo HAYASHI         if (res >= 0)
96*820c1a8dSHiroo HAYASHI           {
                                      /* Sanity check: a zero result and a null wide character
                                         must go together; anything else is treated as a fatal
                                         inconsistency.  */
97*820c1a8dSHiroo HAYASHI             if ((wc == 0) != (res == 0))
98*820c1a8dSHiroo HAYASHI               abort ();
                                      /* Store the result only if the caller asked for it.  */
99*820c1a8dSHiroo HAYASHI             if (pwc != NULL)
100*820c1a8dSHiroo HAYASHI               *pwc = wc;
101*820c1a8dSHiroo HAYASHI             goto success;
102*820c1a8dSHiroo HAYASHI           }
103*820c1a8dSHiroo HAYASHI 
104*820c1a8dSHiroo HAYASHI         /* mbtowc does not distinguish between invalid and incomplete multibyte
105*820c1a8dSHiroo HAYASHI            sequences.  But mbrtowc needs to make this distinction.
106*820c1a8dSHiroo HAYASHI            There are two possible approaches:
107*820c1a8dSHiroo HAYASHI              - Use iconv() and its return value.
108*820c1a8dSHiroo HAYASHI              - Use built-in knowledge about the possible encodings.
109*820c1a8dSHiroo HAYASHI            Given the low quality of implementation of iconv() on the systems
110*820c1a8dSHiroo HAYASHI            that lack mbrtowc(), we use the second approach.
111*820c1a8dSHiroo HAYASHI            The possible encodings are:
112*820c1a8dSHiroo HAYASHI              - 8-bit encodings,
113*820c1a8dSHiroo HAYASHI              - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
114*820c1a8dSHiroo HAYASHI              - UTF-8 (already handled above).
115*820c1a8dSHiroo HAYASHI            Use specialized code for each.  */
                                   /* A failed sequence of 4 bytes, or one already as long as
                                      MB_CUR_MAX, is classified as invalid rather than
                                      incomplete.  */
116*820c1a8dSHiroo HAYASHI         if (m >= 4 || m >= MB_CUR_MAX)
117*820c1a8dSHiroo HAYASHI           goto invalid;
118*820c1a8dSHiroo HAYASHI         /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
                                   /* Each case below reports "incomplete" only for the byte
                                      patterns it tests for; every other pattern is invalid.  */
119*820c1a8dSHiroo HAYASHI         switch (enc)
120*820c1a8dSHiroo HAYASHI           {
121*820c1a8dSHiroo HAYASHI           /* As a reference for this code, you can use the GNU libiconv
122*820c1a8dSHiroo HAYASHI              implementation.  Look for uses of the RET_TOOFEW macro.  */
123*820c1a8dSHiroo HAYASHI 
124*820c1a8dSHiroo HAYASHI           case enc_eucjp: /* EUC-JP */
125*820c1a8dSHiroo HAYASHI             {
                                         /* One byte: incomplete if it is 0xa1..0xfe, 0x8e or
                                            0x8f.  */
126*820c1a8dSHiroo HAYASHI               if (m == 1)
127*820c1a8dSHiroo HAYASHI                 {
128*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
129*820c1a8dSHiroo HAYASHI 
130*820c1a8dSHiroo HAYASHI                   if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
131*820c1a8dSHiroo HAYASHI                     goto incomplete;
132*820c1a8dSHiroo HAYASHI                 }
                                         /* Two bytes: incomplete only for 0x8f followed by a
                                            byte in 0xa1..0xfe.  */
133*820c1a8dSHiroo HAYASHI               if (m == 2)
134*820c1a8dSHiroo HAYASHI                 {
135*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
136*820c1a8dSHiroo HAYASHI 
137*820c1a8dSHiroo HAYASHI                   if (c == 0x8f)
138*820c1a8dSHiroo HAYASHI                     {
139*820c1a8dSHiroo HAYASHI                       unsigned char c2 = (unsigned char) p[1];
140*820c1a8dSHiroo HAYASHI 
141*820c1a8dSHiroo HAYASHI                       if (c2 >= 0xa1 && c2 < 0xff)
142*820c1a8dSHiroo HAYASHI                         goto incomplete;
143*820c1a8dSHiroo HAYASHI                     }
144*820c1a8dSHiroo HAYASHI                 }
145*820c1a8dSHiroo HAYASHI               goto invalid;
146*820c1a8dSHiroo HAYASHI             }
147*820c1a8dSHiroo HAYASHI 
148*820c1a8dSHiroo HAYASHI           case enc_94: /* EUC-KR, GB2312, BIG5 */
149*820c1a8dSHiroo HAYASHI             {
                                         /* One byte in 0xa1..0xfe is incomplete; no longer
                                            sequence is.  */
150*820c1a8dSHiroo HAYASHI               if (m == 1)
151*820c1a8dSHiroo HAYASHI                 {
152*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
153*820c1a8dSHiroo HAYASHI 
154*820c1a8dSHiroo HAYASHI                   if (c >= 0xa1 && c < 0xff)
155*820c1a8dSHiroo HAYASHI                     goto incomplete;
156*820c1a8dSHiroo HAYASHI                 }
157*820c1a8dSHiroo HAYASHI               goto invalid;
158*820c1a8dSHiroo HAYASHI             }
159*820c1a8dSHiroo HAYASHI 
160*820c1a8dSHiroo HAYASHI           case enc_euctw: /* EUC-TW */
161*820c1a8dSHiroo HAYASHI             {
                                         /* One byte: incomplete if it is 0xa1..0xfe or 0x8e.
                                            Two or three bytes: incomplete only when the first
                                            byte is 0x8e.  */
162*820c1a8dSHiroo HAYASHI               if (m == 1)
163*820c1a8dSHiroo HAYASHI                 {
164*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
165*820c1a8dSHiroo HAYASHI 
166*820c1a8dSHiroo HAYASHI                   if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
167*820c1a8dSHiroo HAYASHI                     goto incomplete;
168*820c1a8dSHiroo HAYASHI                 }
169*820c1a8dSHiroo HAYASHI               else /* m == 2 || m == 3 */
170*820c1a8dSHiroo HAYASHI                 {
171*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
172*820c1a8dSHiroo HAYASHI 
173*820c1a8dSHiroo HAYASHI                   if (c == 0x8e)
174*820c1a8dSHiroo HAYASHI                     goto incomplete;
175*820c1a8dSHiroo HAYASHI                 }
176*820c1a8dSHiroo HAYASHI               goto invalid;
177*820c1a8dSHiroo HAYASHI             }
178*820c1a8dSHiroo HAYASHI 
179*820c1a8dSHiroo HAYASHI           case enc_gb18030: /* GB18030 */
180*820c1a8dSHiroo HAYASHI             {
                                         /* One byte: incomplete if it is 0x90..0xe3 or
                                            0xf8..0xfe.  */
181*820c1a8dSHiroo HAYASHI               if (m == 1)
182*820c1a8dSHiroo HAYASHI                 {
183*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
184*820c1a8dSHiroo HAYASHI 
185*820c1a8dSHiroo HAYASHI                   if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
186*820c1a8dSHiroo HAYASHI                     goto incomplete;
187*820c1a8dSHiroo HAYASHI                 }
                                         /* Two or three bytes: incomplete only for a first byte
                                            in 0x90..0xe3 followed by a byte in 0x30..0x39 and,
                                            when present, a third byte in 0x81..0xfe.  */
188*820c1a8dSHiroo HAYASHI               else /* m == 2 || m == 3 */
189*820c1a8dSHiroo HAYASHI                 {
190*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
191*820c1a8dSHiroo HAYASHI 
192*820c1a8dSHiroo HAYASHI                   if (c >= 0x90 && c <= 0xe3)
193*820c1a8dSHiroo HAYASHI                     {
194*820c1a8dSHiroo HAYASHI                       unsigned char c2 = (unsigned char) p[1];
195*820c1a8dSHiroo HAYASHI 
196*820c1a8dSHiroo HAYASHI                       if (c2 >= 0x30 && c2 <= 0x39)
197*820c1a8dSHiroo HAYASHI                         {
198*820c1a8dSHiroo HAYASHI                           if (m == 2)
199*820c1a8dSHiroo HAYASHI                             goto incomplete;
200*820c1a8dSHiroo HAYASHI                           else /* m == 3 */
201*820c1a8dSHiroo HAYASHI                             {
202*820c1a8dSHiroo HAYASHI                               unsigned char c3 = (unsigned char) p[2];
203*820c1a8dSHiroo HAYASHI 
204*820c1a8dSHiroo HAYASHI                               if (c3 >= 0x81 && c3 <= 0xfe)
205*820c1a8dSHiroo HAYASHI                                 goto incomplete;
206*820c1a8dSHiroo HAYASHI                             }
207*820c1a8dSHiroo HAYASHI                         }
208*820c1a8dSHiroo HAYASHI                     }
209*820c1a8dSHiroo HAYASHI                 }
210*820c1a8dSHiroo HAYASHI               goto invalid;
211*820c1a8dSHiroo HAYASHI             }
212*820c1a8dSHiroo HAYASHI 
213*820c1a8dSHiroo HAYASHI           case enc_sjis: /* SJIS */
214*820c1a8dSHiroo HAYASHI             {
                                         /* One byte: incomplete if it is 0x81..0x9f,
                                            0xe0..0xea or 0xf0..0xf9; no longer sequence is.  */
215*820c1a8dSHiroo HAYASHI               if (m == 1)
216*820c1a8dSHiroo HAYASHI                 {
217*820c1a8dSHiroo HAYASHI                   unsigned char c = (unsigned char) p[0];
218*820c1a8dSHiroo HAYASHI 
219*820c1a8dSHiroo HAYASHI                   if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
220*820c1a8dSHiroo HAYASHI                       || (c >= 0xf0 && c <= 0xf9))
221*820c1a8dSHiroo HAYASHI                     goto incomplete;
222*820c1a8dSHiroo HAYASHI                 }
223*820c1a8dSHiroo HAYASHI               goto invalid;
224*820c1a8dSHiroo HAYASHI             }
225*820c1a8dSHiroo HAYASHI 
226*820c1a8dSHiroo HAYASHI           default:
227*820c1a8dSHiroo HAYASHI             /* An unknown multibyte encoding.  */
                                       /* Without knowledge of the encoding, the short sequence
                                          is assumed to be incomplete rather than invalid.  */
228*820c1a8dSHiroo HAYASHI             goto incomplete;
229*820c1a8dSHiroo HAYASHI           }
230*820c1a8dSHiroo HAYASHI       }
231*820c1a8dSHiroo HAYASHI 
232*820c1a8dSHiroo HAYASHI    success:
233*820c1a8dSHiroo HAYASHI     /* res >= 0 is the corrected return value of
234*820c1a8dSHiroo HAYASHI        mbtowc_with_lock (&wc, p, m).  */
                               /* The converted character must extend beyond the nstate bytes
                                  that were already saved (a null character counts as one
                                  byte here); otherwise abort.  */
235*820c1a8dSHiroo HAYASHI     if (nstate >= (res > 0 ? res : 1))
236*820c1a8dSHiroo HAYASHI       abort ();
                               /* Return only the number of bytes taken from s in this call,
                                  and mark the state as holding no pending bytes.  */
237*820c1a8dSHiroo HAYASHI     res -= nstate;
238*820c1a8dSHiroo HAYASHI     pstate[0] = 0;
239*820c1a8dSHiroo HAYASHI     return res;
240*820c1a8dSHiroo HAYASHI 
241*820c1a8dSHiroo HAYASHI    incomplete:
                               /* Save the new bytes from s after the nstate bytes already in
                                  the state, so that pstate[1..m] holds the whole partial
                                  sequence, then record its length in pstate[0].  */
242*820c1a8dSHiroo HAYASHI     {
243*820c1a8dSHiroo HAYASHI       size_t k = nstate;
244*820c1a8dSHiroo HAYASHI       /* Here 0 <= k < m < 4.  */
245*820c1a8dSHiroo HAYASHI       pstate[++k] = s[0];
246*820c1a8dSHiroo HAYASHI       if (k < m)
247*820c1a8dSHiroo HAYASHI         {
248*820c1a8dSHiroo HAYASHI           pstate[++k] = s[1];
249*820c1a8dSHiroo HAYASHI           if (k < m)
250*820c1a8dSHiroo HAYASHI             pstate[++k] = s[2];
251*820c1a8dSHiroo HAYASHI         }
                                 /* More than 3 bytes would have been needed: internal
                                    inconsistency.  */
252*820c1a8dSHiroo HAYASHI       if (k != m)
253*820c1a8dSHiroo HAYASHI         abort ();
254*820c1a8dSHiroo HAYASHI     }
255*820c1a8dSHiroo HAYASHI     pstate[0] = m;
256*820c1a8dSHiroo HAYASHI     return (size_t)(-2);
257*820c1a8dSHiroo HAYASHI 
258*820c1a8dSHiroo HAYASHI    invalid:
                               /* Report an invalid sequence; the state is not modified here.  */
259*820c1a8dSHiroo HAYASHI     errno = EILSEQ;
260*820c1a8dSHiroo HAYASHI     /* The conversion state is undefined, says POSIX.  */
261*820c1a8dSHiroo HAYASHI     return (size_t)(-1);
262*820c1a8dSHiroo HAYASHI   }
263