262 lines
		
	
	
	
		
			7.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			262 lines
		
	
	
	
		
			7.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* Convert multibyte character to wide character.
 | 
						|
   Copyright (C) 1999-2002, 2005-2023 Free Software Foundation, Inc.
 | 
						|
 | 
						|
   This file is free software: you can redistribute it and/or modify
 | 
						|
   it under the terms of the GNU Lesser General Public License as
 | 
						|
   published by the Free Software Foundation; either version 2.1 of the
 | 
						|
   License, or (at your option) any later version.
 | 
						|
 | 
						|
   This file is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU Lesser General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU Lesser General Public License
 | 
						|
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
 | 
						|
 | 
						|
/* Written by Bruno Haible <bruno@clisp.org>, 2008.  */
 | 
						|
 | 
						|
/* This file contains the body of the mbrtowc and mbrtoc32 functions,
 | 
						|
   when GNULIB_defined_mbstate_t is defined.  */
 | 
						|
 | 
						|
  char *pstate = (char *)ps;
 | 
						|
 | 
						|
  if (s == NULL)
 | 
						|
    {
 | 
						|
      pwc = NULL;
 | 
						|
      s = "";
 | 
						|
      n = 1;
 | 
						|
    }
 | 
						|
 | 
						|
  if (n == 0)
 | 
						|
    return (size_t)(-2);
 | 
						|
 | 
						|
  /* Here n > 0.  */
 | 
						|
 | 
						|
  if (pstate == NULL)
 | 
						|
    pstate = internal_state;
 | 
						|
 | 
						|
  {
 | 
						|
    size_t nstate = pstate[0];
 | 
						|
    char buf[4];
 | 
						|
    const char *p;
 | 
						|
    size_t m;
 | 
						|
    enc_t enc;
 | 
						|
    int res;
 | 
						|
 | 
						|
    switch (nstate)
 | 
						|
      {
 | 
						|
      case 0:
 | 
						|
        p = s;
 | 
						|
        m = n;
 | 
						|
        break;
 | 
						|
      case 3:
 | 
						|
        buf[2] = pstate[3];
 | 
						|
        FALLTHROUGH;
 | 
						|
      case 2:
 | 
						|
        buf[1] = pstate[2];
 | 
						|
        FALLTHROUGH;
 | 
						|
      case 1:
 | 
						|
        buf[0] = pstate[1];
 | 
						|
        p = buf;
 | 
						|
        m = nstate;
 | 
						|
        buf[m++] = s[0];
 | 
						|
        if (n >= 2 && m < 4)
 | 
						|
          {
 | 
						|
            buf[m++] = s[1];
 | 
						|
            if (n >= 3 && m < 4)
 | 
						|
              buf[m++] = s[2];
 | 
						|
          }
 | 
						|
        break;
 | 
						|
      default:
 | 
						|
        errno = EINVAL;
 | 
						|
        return (size_t)(-1);
 | 
						|
      }
 | 
						|
 | 
						|
    /* Here m > 0.  */
 | 
						|
 | 
						|
    enc = locale_encoding_classification ();
 | 
						|
 | 
						|
    if (enc == enc_utf8) /* UTF-8 */
 | 
						|
      {
 | 
						|
        /* Achieve
 | 
						|
             - multi-thread safety and
 | 
						|
             - the ability to produce wide character values > WCHAR_MAX
 | 
						|
           by not calling mbtowc() at all.  */
 | 
						|
#include "mbrtowc-impl-utf8.h"
 | 
						|
      }
 | 
						|
    else
 | 
						|
      {
 | 
						|
        /* The hidden internal state of mbtowc would make this function not
 | 
						|
           multi-thread safe.  Achieve multi-thread safety through a lock.  */
 | 
						|
        wchar_t wc;
 | 
						|
        res = mbtowc_with_lock (&wc, p, m);
 | 
						|
 | 
						|
        if (res >= 0)
 | 
						|
          {
 | 
						|
            if ((wc == 0) != (res == 0))
 | 
						|
              abort ();
 | 
						|
            if (pwc != NULL)
 | 
						|
              *pwc = wc;
 | 
						|
            goto success;
 | 
						|
          }
 | 
						|
 | 
						|
        /* mbtowc does not distinguish between invalid and incomplete multibyte
 | 
						|
           sequences.  But mbrtowc needs to make this distinction.
 | 
						|
           There are two possible approaches:
 | 
						|
             - Use iconv() and its return value.
 | 
						|
             - Use built-in knowledge about the possible encodings.
 | 
						|
           Given the low quality of implementation of iconv() on the systems
 | 
						|
           that lack mbrtowc(), we use the second approach.
 | 
						|
           The possible encodings are:
 | 
						|
             - 8-bit encodings,
 | 
						|
             - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
 | 
						|
             - UTF-8 (already handled above).
 | 
						|
           Use specialized code for each.  */
 | 
						|
        if (m >= 4 || m >= MB_CUR_MAX)
 | 
						|
          goto invalid;
 | 
						|
        /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 | 
						|
        switch (enc)
 | 
						|
          {
 | 
						|
          /* As a reference for this code, you can use the GNU libiconv
 | 
						|
             implementation.  Look for uses of the RET_TOOFEW macro.  */
 | 
						|
 | 
						|
          case enc_eucjp: /* EUC-JP */
 | 
						|
            {
 | 
						|
              if (m == 1)
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 | 
						|
                    goto incomplete;
 | 
						|
                }
 | 
						|
              if (m == 2)
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if (c == 0x8f)
 | 
						|
                    {
 | 
						|
                      unsigned char c2 = (unsigned char) p[1];
 | 
						|
 | 
						|
                      if (c2 >= 0xa1 && c2 < 0xff)
 | 
						|
                        goto incomplete;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
              goto invalid;
 | 
						|
            }
 | 
						|
 | 
						|
          case enc_94: /* EUC-KR, GB2312, BIG5 */
 | 
						|
            {
 | 
						|
              if (m == 1)
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if (c >= 0xa1 && c < 0xff)
 | 
						|
                    goto incomplete;
 | 
						|
                }
 | 
						|
              goto invalid;
 | 
						|
            }
 | 
						|
 | 
						|
          case enc_euctw: /* EUC-TW */
 | 
						|
            {
 | 
						|
              if (m == 1)
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 | 
						|
                    goto incomplete;
 | 
						|
                }
 | 
						|
              else /* m == 2 || m == 3 */
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if (c == 0x8e)
 | 
						|
                    goto incomplete;
 | 
						|
                }
 | 
						|
              goto invalid;
 | 
						|
            }
 | 
						|
 | 
						|
          case enc_gb18030: /* GB18030 */
 | 
						|
            {
 | 
						|
              if (m == 1)
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
 | 
						|
                    goto incomplete;
 | 
						|
                }
 | 
						|
              else /* m == 2 || m == 3 */
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if (c >= 0x90 && c <= 0xe3)
 | 
						|
                    {
 | 
						|
                      unsigned char c2 = (unsigned char) p[1];
 | 
						|
 | 
						|
                      if (c2 >= 0x30 && c2 <= 0x39)
 | 
						|
                        {
 | 
						|
                          if (m == 2)
 | 
						|
                            goto incomplete;
 | 
						|
                          else /* m == 3 */
 | 
						|
                            {
 | 
						|
                              unsigned char c3 = (unsigned char) p[2];
 | 
						|
 | 
						|
                              if (c3 >= 0x81 && c3 <= 0xfe)
 | 
						|
                                goto incomplete;
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
              goto invalid;
 | 
						|
            }
 | 
						|
 | 
						|
          case enc_sjis: /* SJIS */
 | 
						|
            {
 | 
						|
              if (m == 1)
 | 
						|
                {
 | 
						|
                  unsigned char c = (unsigned char) p[0];
 | 
						|
 | 
						|
                  if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 | 
						|
                      || (c >= 0xf0 && c <= 0xf9))
 | 
						|
                    goto incomplete;
 | 
						|
                }
 | 
						|
              goto invalid;
 | 
						|
            }
 | 
						|
 | 
						|
          default:
 | 
						|
            /* An unknown multibyte encoding.  */
 | 
						|
            goto incomplete;
 | 
						|
          }
 | 
						|
      }
 | 
						|
 | 
						|
   success:
 | 
						|
    /* res >= 0 is the corrected return value of
 | 
						|
       mbtowc_with_lock (&wc, p, m).  */
 | 
						|
    if (nstate >= (res > 0 ? res : 1))
 | 
						|
      abort ();
 | 
						|
    res -= nstate;
 | 
						|
    pstate[0] = 0;
 | 
						|
    return res;
 | 
						|
 | 
						|
   incomplete:
 | 
						|
    {
 | 
						|
      size_t k = nstate;
 | 
						|
      /* Here 0 <= k < m < 4.  */
 | 
						|
      pstate[++k] = s[0];
 | 
						|
      if (k < m)
 | 
						|
        {
 | 
						|
          pstate[++k] = s[1];
 | 
						|
          if (k < m)
 | 
						|
            pstate[++k] = s[2];
 | 
						|
        }
 | 
						|
      if (k != m)
 | 
						|
        abort ();
 | 
						|
    }
 | 
						|
    pstate[0] = m;
 | 
						|
    return (size_t)(-2);
 | 
						|
 | 
						|
   invalid:
 | 
						|
    errno = EILSEQ;
 | 
						|
    /* The conversion state is undefined, says POSIX.  */
 | 
						|
    return (size_t)(-1);
 | 
						|
  }
 |