LCOV - code coverage report
Current view: top level - sal/textenc - tcvtutf8.cxx (source / functions) Hit Total Coverage
Test: commit 10e77ab3ff6f4314137acd6e2702a6e5c1ce1fae Lines: 182 208 87.5 %
Date: 2014-11-03 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include "sal/config.h"
      21             : 
      22             : #include "sal/types.h"
      23             : #include "rtl/textcvt.h"
      24             : 
      25             : #include "converter.hxx"
      26             : #include "tcvtutf8.hxx"
      27             : #include "tenchelp.hxx"
      28             : #include "unichars.hxx"
      29             : 
      30             : struct ImplUtf8ToUnicodeContext
      31             : {
                           /* State that ImplConvertUtf8ToUnicode loads on entry and stores on
                              exit, so that a multi-byte sequence may be split across calls. */
                           /* Code point bits gathered so far from the sequence in progress. */
      32             :     sal_uInt32 nUtf32;
                           /* Left shift applied to the next continuation byte's payload bits;
                              negative while no sequence is in progress. */
      33             :     int nShift;
                           /* True until the first character or bad input has been handled;
                              only while it is set can a leading U+FEFF be dropped. */
      34             :     bool bCheckBom;
      35             : };
      36             : 
      37             : struct ImplUnicodeToUtf8Context
      38             : {
                           /* State that ImplConvertUnicodeToUtf8 loads on entry and stores on
                              exit.  The field holds a high surrogate that is still waiting for
                              its low surrogate, 0 when nothing is pending, or the initial
                              value 0xFFFF, which the converter takes to mean that the optional
                              BOM has not been dealt with yet. */
      39             :     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
      40             : };
      41             : 
      42          96 : void * ImplCreateUtf8ToUnicodeContext()
      43             : {
                           /* Allocates an ImplUtf8ToUnicodeContext with new, resets it via
                              ImplResetUtf8ToUnicodeContext and returns it as an untyped pointer.
                              ImplDestroyUtf8ToUnicodeContext deletes such a pointer. */
      44          96 :     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
      45          96 :     ImplResetUtf8ToUnicodeContext(p);
      46          96 :     return p;
      47             : }
      48             : 
      49         108 : void ImplResetUtf8ToUnicodeContext(void * pContext)
      50             : {
                           /* Puts the context back into its start state: no sequence in
                              progress (nShift < 0) and BOM checking enabled.  A null pContext
                              is ignored. */
                           /* NOTE(review): nUtf32 is not assigned here, and the context is
                              created with a plain new-expression, so a fresh context appears to
                              hold an indeterminate nUtf32.  ImplConvertUtf8ToUnicode copies that
                              field on entry, although it overwrites the copy before using it
                              whenever nShift < 0.  Consider zeroing it here; confirm. */
      51         108 :     if (pContext != NULL)
      52             :     {
      53         108 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
      54         108 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
      55             :     }
      56         108 : }
      57             : 
      58          96 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
      59             : {
                           /* Deletes pContext as an ImplUtf8ToUnicodeContext; deleting a null
                              pointer is a no-op in C++. */
      60          96 :     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
      61          96 : }
      62             : 
      63     1658577 : sal_Size ImplConvertUtf8ToUnicode(
      64             :     void const * pData, void * pContext, char const * pSrcBuf,
      65             :     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
      66             :     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
      67             : {
                           /*
                              Converts the nSrcBytes bytes at pSrcBuf from UTF-8 into UTF-16 code
                              units written to pDestBuf, which has room for nDestChars units.

                              pData         non-null selects the Java UTF-8 variant; the only
                                            effect visible in this function is that a leading
                                            U+FEFF is then never dropped
                              pContext      optional ImplUtf8ToUnicodeContext; when non-null the
                                            decoder state is loaded from it on entry and stored
                                            back on exit
                              nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; GLOBAL_SIGNATURE and
                                            FLUSH are read here, and the whole value is passed
                                            on to the bad-input helper
                              pInfo         when non-null, receives the accumulated
                                            RTL_TEXTTOUNICODE_INFO_* bits
                              pSrcCvtBytes  when non-null, receives the number of source bytes
                                            consumed

                              Returns the number of UTF-16 code units written to pDestBuf.
                            */
      68             :     /*
      69             :        This function is very liberal with the UTF-8 input.  Accepted are:
      70             :        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
      71             :        - surrogates (e.g., ED A0 80 to represent U+D800)
      72             :        - encodings with up to six bytes (everything outside the range
      73             :          U+0000..10FFFF is considered "undefined")
      74             :        The first two of these points allow this routine to translate from both
      75             :        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
      76             :       */
      77             : 
      78     1658577 :     bool bJavaUtf8 = pData != NULL;
                           /* Defaults used when no context is supplied: no sequence in
                              progress, BOM check enabled. */
      79     1658577 :     sal_uInt32 nUtf32 = 0;
      80     1658577 :     int nShift = -1;
      81     1658577 :     bool bCheckBom = true;
      82     1658577 :     sal_uInt32 nInfo = 0;
      83     1658577 :     unsigned char const * pSrcBufPtr = (unsigned char const *) pSrcBuf;
      84     1658577 :     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
      85     1658577 :     sal_Unicode * pDestBufPtr = pDestBuf;
      86     1658577 :     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
      87             : 
                           /* Resume from the state stored at the end of an earlier call. */
      88     1658577 :     if (pContext != NULL)
      89             :     {
      90      185378 :         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
      91      185378 :         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
      92      185378 :         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
      93             :     }
      94             : 
      95    39711428 :     while (pSrcBufPtr < pSrcBufEnd)
      96             :     {
      97    36394694 :         bool bUndefined = false;
      98    36394694 :         bool bConsume = true;
      99    36394694 :         sal_uInt32 nChar = *pSrcBufPtr++;
                               /* No sequence in progress: classify the byte as a lead byte.  The
                                  lead byte's payload bits are placed at the top of nUtf32 and
                                  nShift is set to the shift of the first continuation byte. */
     100    36394694 :         if (nShift < 0)
     101    30384495 :             if (nChar <= 0x7F)
     102             :             {
     103    27052560 :                 nUtf32 = nChar;
     104    27052560 :                 goto transform;
     105             :             }
                                   /* 0x80..0xBF: a continuation byte without a lead byte. */
     106     3331935 :             else if (nChar <= 0xBF)
     107       23104 :                 goto bad_input;
     108     3308831 :             else if (nChar <= 0xDF)
     109             :             {
     110       69491 :                 nUtf32 = (nChar & 0x1F) << 6;
     111       69491 :                 nShift = 0;
     112             :             }
     113     3239340 :             else if (nChar <= 0xEF)
     114             :             {
     115     2964824 :                 nUtf32 = (nChar & 0x0F) << 12;
     116     2964824 :                 nShift = 6;
     117             :             }
     118      274516 :             else if (nChar <= 0xF7)
     119             :             {
     120       16896 :                 nUtf32 = (nChar & 0x07) << 18;
     121       16896 :                 nShift = 12;
     122             :             }
     123      257620 :             else if (nChar <= 0xFB)
     124             :             {
     125          30 :                 nUtf32 = (nChar & 0x03) << 24;
     126          30 :                 nShift = 18;
     127             :             }
     128      257590 :             else if (nChar <= 0xFD)
     129             :             {
     130        1286 :                 nUtf32 = (nChar & 0x01) << 30;
     131        1286 :                 nShift = 24;
     132             :             }
                                   /* 0xFE and 0xFF never start a sequence. */
     133             :             else
     134      256304 :                 goto bad_input;
                               /* Sequence in progress and the byte has the form 10xxxxxx: merge
                                  its six payload bits; the byte merged at shift 0 is the last. */
     135     6010199 :         else if ((nChar & 0xC0) == 0x80)
     136             :         {
     137     5968403 :             nUtf32 |= (nChar & 0x3F) << nShift;
     138     5968403 :             if (nShift == 0)
     139     3010723 :                 goto transform;
     140             :             else
     141     2957680 :                 nShift -= 6;
     142             :         }
     143             :         else
     144             :         {
     145             :             /*
     146             :              This byte is preceded by a broken UTF-8 sequence; if this byte
     147             :              is neither in the range [0x80..0xBF] nor in the range
     148             :              [0xFE..0xFF], assume that this byte does not belong to that
     149             :              broken sequence, but instead starts a new, legal UTF-8 sequence:
     150             :              */
     151       41796 :             bConsume = nChar >= 0xFE;
     152       41796 :             goto bad_input;
     153             :         }
                               /* The sequence is not complete yet; go on with the next byte. */
     154     6010207 :         continue;
     155             : 
                           /* A complete code point is in nUtf32.  It is written out unless it is
                              a U+FEFF seen while bCheckBom is still set, the GLOBAL_SIGNATURE
                              flag is given and Java mode is off; in that one case it is dropped. */
     156             :     transform:
     157    30063283 :         if (!bCheckBom || nUtf32 != 0xFEFF
     158          20 :             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
     159          12 :             || bJavaUtf8)
     160             :         {
                                   /* Up to 0xFFFF: one code unit.  Up to 0x10FFFF: a surrogate
                                      pair, needing room for two units.  Anything larger is
                                      reported as an undefined character. */
     161    30063275 :             if (nUtf32 <= 0xFFFF)
     162    30063249 :                 if (pDestBufPtr != pDestBufEnd)
     163    30063015 :                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
     164             :                 else
     165         234 :                     goto no_output;
     166          26 :             else if (nUtf32 <= 0x10FFFF)
     167          10 :                 if (pDestBufEnd - pDestBufPtr >= 2)
     168             :                 {
     169          10 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
     170          10 :                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
     171             :                 }
     172             :                 else
     173           0 :                     goto no_output;
     174             :             else
     175             :             {
     176          16 :                 bUndefined = true;
     177          16 :                 goto bad_input;
     178             :             }
     179             :         }
     180    30063033 :         nShift = -1;
     181    30063033 :         bCheckBom = false;
     182    30063033 :         continue;
     183             : 
                           /* Malformed or undefined input.  The helper's result selects what
                              happens next: STOP ends the loop, CONTINUE resumes with the next
                              byte, NO_OUTPUT is handled like a full destination buffer.  For
                              STOP and CONTINUE the decoder state is cleared, and a byte that
                              was judged to start a new sequence (bConsume false) is un-read so
                              that it is decoded again as a lead byte. */
     184             :     bad_input:
     185      321220 :         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     186             :                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     187      321220 :                     &nInfo))
     188             :         {
     189             :         case sal::detail::textenc::BAD_INPUT_STOP:
     190          62 :             nShift = -1;
     191          62 :             bCheckBom = false;
     192          62 :             if (!bConsume)
     193           4 :                 --pSrcBufPtr;
     194          62 :             break;
     195             : 
     196             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     197      321034 :             nShift = -1;
     198      321034 :             bCheckBom = false;
     199      321034 :             if (!bConsume)
     200       34876 :                 --pSrcBufPtr;
     201      321034 :             continue;
     202             : 
     203             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     204         124 :             goto no_output;
     205             :         }
                               /* Reached only after BAD_INPUT_STOP: leave the while loop. */
     206          62 :         break;
     207             : 
                           /* Not enough room in the destination: un-read the current byte so it
                              is not counted as converted, flag the condition and stop.  nShift
                              and nUtf32 are left as they are. */
     208             :     no_output:
     209         358 :         --pSrcBufPtr;
     210         358 :         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     211         358 :         break;
     212             :     }
     213             : 
                           /* The loop ended inside a sequence without an error or a full
                              destination.  Without FLUSH this is reported as SRCBUFFERTOSMALL and
                              the partial state is kept; with FLUSH the truncated sequence is
                              passed to the bad-input helper. */
     214     1658577 :     if (nShift >= 0
     215         212 :         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
     216             :                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
     217             :                == 0)
     218             :     {
     219         100 :         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
     220          98 :             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
     221             :         else
     222           2 :             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
     223             :                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
     224           2 :                         &nInfo))
     225             :             {
     226             :             case sal::detail::textenc::BAD_INPUT_STOP:
     227             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     228           2 :                 nShift = -1;
     229           2 :                 bCheckBom = false;
     230           2 :                 break;
     231             : 
     232             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     233           0 :                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
     234           0 :                 break;
     235             :             }
     236             :     }
     237             : 
                           /* Store the decoder state for the next call and report the results
                              through whichever out-parameters were supplied. */
     238     1658577 :     if (pContext != NULL)
     239             :     {
     240      185378 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
     241      185378 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
     242      185378 :         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     243             :     }
     244     1658577 :     if (pInfo != NULL)
     245     1658577 :         *pInfo = nInfo;
     246     1658577 :     if (pSrcCvtBytes != NULL)
     247     1658577 :         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
     248     1658577 :     return pDestBufPtr - pDestBuf;
     249             : }
     250             : 
     251        4568 : void * ImplCreateUnicodeToUtf8Context()
     252             : {
                           /* Allocates an ImplUnicodeToUtf8Context with new, resets it via
                              ImplResetUnicodeToUtf8Context and returns it as an untyped pointer.
                              ImplDestroyUnicodeToUtf8Context deletes such a pointer. */
     253        4568 :     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
     254        4568 :     ImplResetUnicodeToUtf8Context(p);
     255        4568 :     return p;
     256             : }
     257             : 
     258        4568 : void ImplResetUnicodeToUtf8Context(void * pContext)
     259             : {
                           /* Puts the context back into its start state.  0xFFFF is the marker
                              that ImplConvertUnicodeToUtf8 tests before deciding whether to
                              write a BOM.  A null pContext is ignored. */
     260        4568 :     if (pContext != NULL)
     261        4568 :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
     262        4568 : }
     263             : 
     264        4568 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
     265             : {
                           /* Deletes pContext as an ImplUnicodeToUtf8Context; deleting a null
                              pointer is a no-op in C++. */
     266        4568 :     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
     267        4568 : }
     268             : 
     269     1284215 : sal_Size ImplConvertUnicodeToUtf8(
     270             :     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
     271             :     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
     272             :     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
     273             : {
                           /*
                              Converts the nSrcChars UTF-16 code units at pSrcBuf into UTF-8 bytes
                              written to pDestBuf, which has room for nDestBytes bytes.

                              pData         non-null selects the Java UTF-8 variant: no BOM is
                                            written, surrogates are not combined (each one takes
                                            the three-byte branch on its own), and U+0000 is
                                            written as the two bytes C0 80
                              pContext      optional ImplUnicodeToUtf8Context; when non-null the
                                            pending-surrogate state is loaded from it on entry
                                            and stored back on exit
                              nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; GLOBAL_SIGNATURE and
                                            FLUSH are read here, and the whole value is passed
                                            on to the bad-input helper
                              pInfo         when non-null, receives the accumulated
                                            RTL_UNICODETOTEXT_INFO_* bits
                              pSrcCvtChars  when non-null, receives the number of source code
                                            units consumed

                              Returns the number of bytes written to pDestBuf.
                            */
     274     1284215 :     bool bJavaUtf8 = pData != NULL;
     275     1284215 :     sal_Unicode nHighSurrogate = 0xFFFF;
     276     1284215 :     sal_uInt32 nInfo = 0;
     277     1284215 :     sal_Unicode const * pSrcBufPtr = pSrcBuf;
     278     1284215 :     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
     279     1284215 :     char * pDestBufPtr = pDestBuf;
     280     1284215 :     char * pDestBufEnd = pDestBufPtr + nDestBytes;
     281             : 
     282     1284215 :     if (pContext != NULL)
     283             :         nHighSurrogate
     284       38122 :             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
     285             : 
                           /* 0xFFFF is the start state: write the BOM if it was requested and
                              Java mode is off, then switch to "no surrogate pending".  If the
                              BOM does not fit, the jump to done skips that switch, so 0xFFFF is
                              what gets stored back into the context. */
     286     1284215 :     if (nHighSurrogate == 0xFFFF)
     287             :     {
     288     1250661 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
     289           6 :             && !bJavaUtf8)
     290             :         {
     291           4 :             if (pDestBufEnd - pDestBufPtr >= 3)
     292             :             {
     293             :                 /* Write BOM (U+FEFF) as UTF-8: */
     294           4 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
     295           4 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
     296           4 :                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
     297             :             }
     298             :             else
     299             :             {
     300           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     301           0 :                 goto done;
     302             :             }
     303             :         }
     304     1250661 :         nHighSurrogate = 0;
     305             :     }
     306             : 
     307    48386242 :     while (pSrcBufPtr < pSrcBufEnd)
     308             :     {
     309    45817812 :         sal_uInt32 nChar = *pSrcBufPtr++;
                               /* Nothing pending: outside Java mode a high surrogate is held
                                  back until the following code unit has been read. */
     310    45817812 :         if (nHighSurrogate == 0)
     311             :         {
     312    45817810 :             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
     313             :             {
     314           2 :                 nHighSurrogate = (sal_Unicode) nChar;
     315           2 :                 continue;
     316             :             }
     317             :         }
                               /* A high surrogate is pending: it must be followed by a low
                                  surrogate, otherwise this is bad input (the current code unit
                                  has already been consumed at that point). */
     318           2 :         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     319           2 :             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
     320             :         else
     321           0 :             goto bad_input;
     322             : 
                               /* Reject a low surrogate that had no high surrogate before it
                                  (outside Java mode) and anything ImplIsNoncharacter flags. */
     323    91635622 :         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
     324    91635620 :             || ImplIsNoncharacter(nChar))
     325           4 :             goto bad_input;
     326             : 
                               /* Choose the encoded length from the value.  In Java mode U+0000
                                  fails the first test and so takes the two-byte branch. */
     327    45817806 :         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
     328    91438808 :             if (pDestBufPtr != pDestBufEnd)
     329    45719404 :                 *pDestBufPtr++ = static_cast< char >(nChar);
     330             :             else
     331           0 :                 goto no_output;
     332       98402 :         else if (nChar <= 0x7FF)
     333        9483 :             if (pDestBufEnd - pDestBufPtr >= 2)
     334             :             {
     335        9483 :                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
     336        9483 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     337             :             }
     338             :             else
     339           0 :                 goto no_output;
     340       88919 :         else if (nChar <= 0xFFFF)
     341       88917 :             if (pDestBufEnd - pDestBufPtr >= 3)
     342             :             {
     343       88917 :                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
     344       88917 :                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     345       88917 :                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     346             :             }
     347             :             else
     348           0 :                 goto no_output;
                               /* Everything above 0xFFFF (a combined surrogate pair) takes
                                  four bytes. */
     349           2 :         else if (pDestBufEnd - pDestBufPtr >= 4)
     350             :         {
     351           2 :             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
     352           2 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
     353           2 :             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
     354           2 :             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
     355             :         }
     356             :         else
     357           0 :             goto no_output;
     358    45817806 :         nHighSurrogate = 0;
     359    45817806 :         continue;
     360             : 
                           /* Unconvertible input.  The helper's result selects what happens
                              next: STOP ends the loop, CONTINUE resumes with the next code unit,
                              NO_OUTPUT is handled like a full destination buffer.  STOP and
                              CONTINUE both discard any pending high surrogate. */
     361             :     bad_input:
     362           4 :         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     363             :                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
     364           4 :                     0, NULL))
     365             :         {
     366             :         case sal::detail::textenc::BAD_INPUT_STOP:
     367           0 :             nHighSurrogate = 0;
     368           0 :             break;
     369             : 
     370             :         case sal::detail::textenc::BAD_INPUT_CONTINUE:
     371           4 :             nHighSurrogate = 0;
     372           4 :             continue;
     373             : 
     374             :         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     375           0 :             goto no_output;
     376             :         }
                               /* Reached only after BAD_INPUT_STOP: leave the while loop. */
     377           0 :         break;
     378             : 
                           /* Not enough room in the destination: un-read the current code unit
                              so it is not counted as converted, flag the condition and stop. */
     379             :     no_output:
     380           0 :         --pSrcBufPtr;
     381           0 :         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     382           0 :         break;
     383             :     }
     384             : 
                           /* The input ended while a high surrogate was still pending, and
                              neither an error nor a full destination has been recorded. */
                           /* NOTE(review): the FLUSH test below is "!= 0", whereas the matching
                              end-of-input check in ImplConvertUtf8ToUnicode sets SRCBUFFERTOSMALL
                              when FLUSH is NOT set.  As written, a trailing high surrogate is
                              reported as SRCBUFFERTOSMALL when FLUSH is set and is handed to the
                              bad-input helper when it is not.  Confirm that this polarity is
                              intended before relying on it. */
     385     1284215 :     if (nHighSurrogate != 0
     386           0 :         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
     387             :                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
     388             :                == 0)
     389             :     {
     390           0 :         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
     391           0 :             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
     392             :         else
     393           0 :             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
     394             :                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
     395           0 :                         NULL, 0, NULL))
     396             :             {
     397             :             case sal::detail::textenc::BAD_INPUT_STOP:
     398             :             case sal::detail::textenc::BAD_INPUT_CONTINUE:
     399           0 :                 nHighSurrogate = 0;
     400           0 :                 break;
     401             : 
     402             :             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
     403           0 :                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
     404           0 :                 break;
     405             :             }
     406             :     }
     407             : 
                           /* Store the pending-surrogate state for the next call and report the
                              results through whichever out-parameters were supplied. */
     408             :  done:
     409     1284215 :     if (pContext != NULL)
     410             :         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
     411       38122 :             = nHighSurrogate;
     412     1284215 :     if (pInfo != NULL)
     413     1284215 :         *pInfo = nInfo;
     414     1284215 :     if (pSrcCvtChars != NULL)
     415     1284215 :         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
     416     1284215 :     return pDestBufPtr - pDestBuf;
     417             : }
     418             : 
     419             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10